1# Configurable Element class lookup
2
3################################################################################
4# Custom Element classes
5
cdef public class ElementBase(_Element) [ type LxmlElementBaseType,
                                          object LxmlElementBase ]:
    u"""ElementBase(*children, attrib=None, nsmap=None, **_extra)

    The public Element class.  Every custom Element class has to
    inherit from it.  Plain Elements are created through the
    `Element()` factory.

    BIG FAT WARNING: Subclasses *must not* override __init__ or
    __new__ as it is absolutely undefined when these objects will be
    created or destroyed.  All persistent state of Elements must be
    stored in the underlying XML.  If an object really needs to be
    initialised after its creation, implement an ``_init(self)``
    method, which will be called directly after object creation.

    Instantiating a subclass of this class creates a new Element.
    Unless configured otherwise, the tag name is the class name and
    the namespace is empty.  The following class attributes change
    that:

    * TAG - the tag name, possibly containing a namespace in Clark
      notation

    * NAMESPACE - the default namespace URI, unless provided as part
      of the TAG attribute.

    * HTML - flag if the class is an HTML tag, as opposed to an XML
      tag.  This only applies to un-namespaced tags and defaults to
      false (i.e. XML).

    * PARSER - the parser that provides the configuration for the
      newly created document.  Providing an HTML parser here will
      default to creating an HTML element.

    In user code, the latter three are commonly inherited in class
    hierarchies that implement a common namespace.
    """
    def __init__(self, *children, attrib=None, nsmap=None, **_extra):
        u"""ElementBase(*children, attrib=None, nsmap=None, **_extra)
        """
        cdef bint is_html = 0
        cdef _BaseParser parser
        cdef _Element previous
        # go through object.__getattribute__ directly because subclasses
        # might have overridden normal attribute access
        raw_getattr = object.__getattribute__

        # namespace: NAMESPACE attribute, possibly replaced by the TAG namespace
        try:
            namespace = _utf8(raw_getattr(self, 'NAMESPACE'))
        except AttributeError:
            namespace = None

        # tag: TAG attribute, otherwise the (unqualified) class name
        try:
            ns, tag = _getNsTag(raw_getattr(self, 'TAG'))
            if ns is not None:
                namespace = ns
        except AttributeError:
            tag = _utf8(raw_getattr(raw_getattr(self, '__class__'), '__name__'))
            tag = tag.rsplit(b'.', 1)[-1]

        # parser: PARSER attribute, otherwise the parser of the first
        # Element found among the children (if any)
        try:
            parser = raw_getattr(self, 'PARSER')
        except AttributeError:
            parser = None
            for child in children:
                if isinstance(child, _Element):
                    parser = (<_Element>child)._doc._parser
                    break

        is_html = isinstance(parser, HTMLParser)
        if namespace is None:
            # the HTML flag only applies to un-namespaced tags
            try:
                is_html = raw_getattr(self, 'HTML')
            except AttributeError:
                pass

        _initNewElement(self, is_html, tag, namespace, parser,
                        attrib, nsmap, _extra)

        # strings become text (before the first child Element) or the tail
        # of the most recently appended child; Elements and ElementBase
        # subclasses are appended as children
        previous = None
        for child in children:
            if _isString(child):
                if previous is None:
                    existing = _collectText(self._c_node.children)
                    _setNodeText(self._c_node, (existing or '') + child)
                else:
                    existing = _collectText(previous._c_node.next)
                    _setTailText(previous._c_node, (existing or '') + child)
                continue

            if isinstance(child, _Element):
                previous = child
            elif isinstance(child, type) and issubclass(child, ElementBase):
                previous = child()
            else:
                raise TypeError, f"Invalid child type: {type(child)!r}"
            _appendChild(self, previous)
96
cdef class CommentBase(_Comment):
    u"""Base class that all custom Comment classes must inherit from.

    XML Comment instances are created with the ``Comment()`` factory.

    Subclasses *must not* override __init__ or __new__ as it is
    absolutely undefined when these objects will be created or
    destroyed.  All persistent state of Comments must be stored in the
    underlying XML.  If an object really needs to be initialised after
    its creation, implement an ``_init(self)`` method, which will be
    called after object creation.
    """
    def __init__(self, text):
        # mirrors the Comment() factory
        cdef _Document doc
        cdef xmlDoc*   c_doc
        text = b'' if text is None else _utf8(text)
        # the comment node lives in a fresh document of its own
        c_doc = _newXMLDoc()
        doc = _documentFactory(c_doc, None)
        self._c_node = _createComment(c_doc, _xcstr(text))
        if self._c_node is NULL:
            raise MemoryError()
        tree.xmlAddChild(<xmlNode*>c_doc, self._c_node)
        _registerProxy(self, doc, self._c_node)
        self._init()
125
cdef class PIBase(_ProcessingInstruction):
    u"""Base class that all custom Processing Instruction classes must
    inherit from.

    XML ProcessingInstruction instances are created with the ``PI()``
    factory.

    Subclasses *must not* override __init__ or __new__ as it is
    absolutely undefined when these objects will be created or
    destroyed.  All persistent state of PIs must be stored in the
    underlying XML.  If an object really needs to be initialised after
    its creation, implement an ``_init(self)`` method, which will be
    called after object creation.
    """
    def __init__(self, target, text=None):
        # mirrors the PI() factory
        cdef _Document doc
        cdef xmlDoc*   c_doc
        target = _utf8(target)
        text = b'' if text is None else _utf8(text)
        # the PI node lives in a fresh document of its own
        c_doc = _newXMLDoc()
        doc = _documentFactory(c_doc, None)
        self._c_node = _createPI(c_doc, _xcstr(target), _xcstr(text))
        if self._c_node is NULL:
            raise MemoryError()
        tree.xmlAddChild(<xmlNode*>c_doc, self._c_node)
        _registerProxy(self, doc, self._c_node)
        self._init()
156
cdef class EntityBase(_Entity):
    u"""Base class that all custom Entity classes must inherit from.

    XML Entity instances are created with the ``Entity()`` factory.

    Subclasses *must not* override __init__ or __new__ as it is
    absolutely undefined when these objects will be created or
    destroyed.  All persistent state of Entities must be stored in the
    underlying XML.  If an object really needs to be initialised after
    its creation, implement an ``_init(self)`` method, which will be
    called after object creation.
    """
    def __init__(self, name):
        cdef _Document doc
        cdef xmlDoc*   c_doc
        name_utf = _utf8(name)
        c_name = _xcstr(name_utf)
        # a leading '#' marks a character reference, anything else has to
        # be a valid XML name
        if c_name[0] != c'#':
            if not _xmlNameIsValid(c_name):
                raise ValueError, f"Invalid entity reference: '{name}'"
        elif not _characterReferenceIsValid(c_name + 1):
            raise ValueError, f"Invalid character reference: '{name}'"
        # the entity node lives in a fresh document of its own
        c_doc = _newXMLDoc()
        doc = _documentFactory(c_doc, None)
        self._c_node = _createEntity(c_doc, c_name)
        if self._c_node is NULL:
            raise MemoryError()
        tree.xmlAddChild(<xmlNode*>c_doc, self._c_node)
        _registerProxy(self, doc, self._c_node)
        self._init()
187
188
cdef int _validateNodeClass(xmlNode* c_node, cls) except -1:
    u"""Check that ``cls`` is an acceptable proxy class for ``c_node``.

    The required base class depends on the node type: ElementBase for
    elements, CommentBase for comments, EntityBase for entity references
    and PIBase for processing instructions.

    Returns 0 on success.  Raises TypeError if ``cls`` is not a class or
    not a subclass of the required base class.
    """
    if c_node.type == tree.XML_ELEMENT_NODE:
        expected = ElementBase
    elif c_node.type == tree.XML_COMMENT_NODE:
        expected = CommentBase
    elif c_node.type == tree.XML_ENTITY_REF_NODE:
        expected = EntityBase
    elif c_node.type == tree.XML_PI_NODE:
        expected = PIBase
    else:
        assert False, f"Unknown node type: {c_node.type}"

    if not (isinstance(cls, type) and issubclass(cls, expected)):
        # Name the required base class itself: 'type(expected)' is always
        # the metaclass and says nothing about what was required.  For the
        # offending value, show the class if one was returned, otherwise
        # the type of the returned object.
        got = cls if isinstance(cls, type) else type(cls)
        raise TypeError(
            f"result of class lookup must be subclass of {expected}, got {got}")
    return 0
205
206
207################################################################################
208# Element class lookup
209
# Signature shared by all lookup functions: (lookup state, document, C node)
ctypedef public object (*_element_class_lookup_function)(object, _Document, xmlNode*)

# base class that carries the lookup function of an element class lookup
cdef public class ElementClassLookup [ type LxmlElementClassLookupType,
                                       object LxmlElementClassLookup ]:
    u"""ElementClassLookup(self)
    Superclass of Element class lookups.
    """
    # C-level function implementing the lookup; other code in this file
    # checks it against NULL before using it
    cdef _element_class_lookup_function _lookup_function
219
220
cdef public class FallbackElementClassLookup(ElementClassLookup) \
         [ type LxmlFallbackElementClassLookupType,
           object LxmlFallbackElementClassLookup ]:
    u"""FallbackElementClassLookup(self, fallback=None)

    Superclass of Element class lookups with additional fallback.
    """
    # lookup object handed to the fallback function as its state
    cdef readonly ElementClassLookup fallback
    # function that is called when this lookup has no result of its own
    cdef _element_class_lookup_function _fallback_function
    def __cinit__(self):
        # fall back to default lookup
        self._fallback_function = _lookupDefaultElementClass

    def __init__(self, ElementClassLookup fallback=None):
        if fallback is not None:
            self._setFallback(fallback)
        else:
            # Reset the state together with the function.  Otherwise, a
            # repeated __init__() call would leave a previously configured
            # fallback object in place, which would then be passed to the
            # default lookup function as if it were its own state.
            self.fallback = None
            self._fallback_function = _lookupDefaultElementClass

    cdef void _setFallback(self, ElementClassLookup lookup):
        u"""Sets the fallback scheme for this lookup method.
        """
        self.fallback = lookup
        self._fallback_function = lookup._lookup_function
        if self._fallback_function is NULL:
            # the lookup provides no function of its own => use the default
            self._fallback_function = _lookupDefaultElementClass

    def set_fallback(self, ElementClassLookup lookup not None):
        u"""set_fallback(self, lookup)

        Sets the fallback scheme for this lookup method.
        """
        self._setFallback(lookup)
254
cdef inline object _callLookupFallback(FallbackElementClassLookup lookup,
                                       _Document doc, xmlNode* c_node):
    # delegate to the configured fallback function, passing the fallback
    # lookup object as its state
    fallback_state = lookup.fallback
    return lookup._fallback_function(fallback_state, doc, c_node)
258
259
260################################################################################
261# default lookup scheme
262
cdef class ElementDefaultClassLookup(ElementClassLookup):
    u"""ElementDefaultClassLookup(self, element=None, comment=None, pi=None, entity=None)
    Element class lookup scheme that always returns the default Element
    class.

    The respective Element classes can be passed as the keyword
    arguments ``element``, ``comment``, ``pi`` and ``entity``.
    """
    cdef readonly object element_class
    cdef readonly object comment_class
    cdef readonly object pi_class
    cdef readonly object entity_class
    def __cinit__(self):
        self._lookup_function = _lookupDefaultElementClass

    def __init__(self, element=None, comment=None, pi=None, entity=None):
        # each argument is validated and stored in turn; None selects the
        # built-in class for that node type
        if element is not None and not issubclass(element, ElementBase):
            raise TypeError, u"element class must be subclass of ElementBase"
        self.element_class = _Element if element is None else element

        if comment is not None and not issubclass(comment, CommentBase):
            raise TypeError, u"comment class must be subclass of CommentBase"
        self.comment_class = _Comment if comment is None else comment

        if entity is not None and not issubclass(entity, EntityBase):
            raise TypeError, u"Entity class must be subclass of EntityBase"
        self.entity_class = _Entity if entity is None else entity

        if pi is not None and not issubclass(pi, PIBase):
            raise TypeError, u"PI class must be subclass of PIBase"
        # None is kept as is: the lookup function special cases it
        self.pi_class = pi
306
cdef object _lookupDefaultElementClass(state, _Document _doc, xmlNode* c_node):
    u"""Trivial class lookup function that always returns the default class.

    ``state`` is either an ElementDefaultClassLookup that provides the
    configured classes, or anything else (including None), in which case
    the built-in classes are returned.
    """
    cdef ElementDefaultClassLookup lookup = None
    # Only read the configured classes from a state object that really is
    # an ElementDefaultClassLookup.  FallbackElementClassLookup can pair
    # this function with a different kind of lookup object as state, and
    # an unchecked cast of that object would read invalid fields.
    if isinstance(state, ElementDefaultClassLookup):
        lookup = <ElementDefaultClassLookup>state

    if c_node.type == tree.XML_ELEMENT_NODE:
        if lookup is not None:
            return lookup.element_class
        else:
            return _Element
    elif c_node.type == tree.XML_COMMENT_NODE:
        if lookup is not None:
            return lookup.comment_class
        else:
            return _Comment
    elif c_node.type == tree.XML_ENTITY_REF_NODE:
        if lookup is not None:
            return lookup.entity_class
        else:
            return _Entity
    elif c_node.type == tree.XML_PI_NODE:
        if lookup is None or lookup.pi_class is None:
            # special case XSLT-PI
            if c_node.name is not NULL and c_node.content is not NULL:
                if tree.xmlStrcmp(c_node.name, <unsigned char*>"xml-stylesheet") == 0:
                    if tree.xmlStrstr(c_node.content, <unsigned char*>"text/xsl") is not NULL or \
                           tree.xmlStrstr(c_node.content, <unsigned char*>"text/xml") is not NULL:
                        return _XSLTProcessingInstruction
            return _ProcessingInstruction
        else:
            return lookup.pi_class
    else:
        assert False, f"Unknown node type: {c_node.type}"
337
338
339################################################################################
340# attribute based lookup scheme
341
cdef class AttributeBasedElementClassLookup(FallbackElementClassLookup):
    u"""AttributeBasedElementClassLookup(self, attribute_name, class_mapping, fallback=None)
    Reads an attribute of an Element and uses its value as key into a
    class dictionary.

    Arguments:
      - attribute name - '{ns}name' style string
      - class mapping  - Python dict mapping attribute values to Element classes
      - fallback       - optional fallback lookup mechanism

    If the attribute is missing, a None key in the class mapping will
    be checked.
    """
    cdef object _class_mapping
    cdef tuple _pytag
    cdef const_xmlChar* _c_ns
    cdef const_xmlChar* _c_name
    def __cinit__(self):
        self._lookup_function = _attribute_class_lookup

    def __init__(self, attribute_name, class_mapping,
                 ElementClassLookup fallback=None):
        # the (namespace, name) tuple is stored on the instance alongside
        # the C string pointers derived from its items
        self._pytag = _getNsTag(attribute_name)
        ns, name = self._pytag
        if ns is not None:
            self._c_ns = _xcstr(ns)
        else:
            self._c_ns = NULL
        self._c_name = _xcstr(name)
        # work on a private copy of the mapping
        self._class_mapping = dict(class_mapping)

        FallbackElementClassLookup.__init__(self, fallback)
374
cdef object _attribute_class_lookup(state, _Document doc, xmlNode* c_node):
    # Lookup function of AttributeBasedElementClassLookup: maps the value
    # of the configured attribute to a class, with fallback for non-element
    # nodes and unmapped values.
    cdef AttributeBasedElementClassLookup lookup
    cdef python.PyObject* mapped

    lookup = <AttributeBasedElementClassLookup>state
    if c_node.type != tree.XML_ELEMENT_NODE:
        return _callLookupFallback(lookup, doc, c_node)

    value = _attributeValueFromNsName(
        c_node, lookup._c_ns, lookup._c_name)
    mapped = python.PyDict_GetItem(lookup._class_mapping, value)
    if mapped is NULL:
        return _callLookupFallback(lookup, doc, c_node)

    cls = <object>mapped
    _validateNodeClass(c_node, cls)
    return cls
389
390
391################################################################################
392#  per-parser lookup scheme
393
cdef class ParserBasedElementClassLookup(FallbackElementClassLookup):
    u"""ParserBasedElementClassLookup(self, fallback=None)
    Element class lookup that delegates to the lookup of the XML parser.
    """
    def __cinit__(self):
        # install the C-level function that implements this scheme
        self._lookup_function = _parser_class_lookup
400
cdef object _parser_class_lookup(state, _Document doc, xmlNode* c_node):
    # Lookup function of ParserBasedElementClassLookup: use the class lookup
    # configured on the document's parser, otherwise the fallback of 'state'.
    cdef ElementClassLookup parser_lookup
    parser_lookup = doc._parser._class_lookup
    # A lookup object without a lookup function is treated as "not set",
    # in line with the NULL checks in set_element_class_lookup() and
    # FallbackElementClassLookup._setFallback().  Calling through a NULL
    # function pointer would crash.
    if parser_lookup is not None and parser_lookup._lookup_function is not NULL:
        return parser_lookup._lookup_function(parser_lookup, doc, c_node)
    return _callLookupFallback(<FallbackElementClassLookup>state, doc, c_node)
406
407
408################################################################################
409#  custom class lookup based on node type, namespace, name
410
cdef class CustomElementClassLookup(FallbackElementClassLookup):
    u"""CustomElementClassLookup(self, fallback=None)
    Element class lookup based on a subclass method.

    Inherit from this class and override the method::

        lookup(self, type, doc, namespace, name)

    to look up the element class for a node. The method receives:
    * type:      one of 'element', 'comment', 'PI', 'entity'
    * doc:       document that the node is in
    * namespace: namespace URI of the node (or None for comments/PIs/entities)
    * name:      name of the element/entity, None for comments, target for PIs

    Returning None from this method makes the fallback get called.
    """
    def __cinit__(self):
        self._lookup_function = _custom_class_lookup

    def lookup(self, type, doc, namespace, name):
        u"lookup(self, type, doc, namespace, name)"
        # default implementation: no decision
        return None
433
cdef object _custom_class_lookup(state, _Document doc, xmlNode* c_node):
    # Lookup function of CustomElementClassLookup: describes the node by
    # type name, namespace and name, and asks the Python level lookup()
    # method for a class.
    cdef CustomElementClassLookup lookup

    lookup = <CustomElementClassLookup>state

    # "element" also serves as the type name of any other node type
    element_type = u"element"
    if c_node.type == tree.XML_COMMENT_NODE:
        element_type = u"comment"
    elif c_node.type == tree.XML_PI_NODE:
        element_type = u"PI"
    elif c_node.type == tree.XML_ENTITY_REF_NODE:
        element_type = u"entity"

    name = funicode(c_node.name) if c_node.name is not NULL else None
    c_str = tree._getNs(c_node)
    if c_str is NULL:
        ns = None
    else:
        ns = funicode(c_str)

    cls = lookup.lookup(element_type, doc, ns, name)
    if cls is None:
        # no decision taken by the user code
        return _callLookupFallback(lookup, doc, c_node)
    _validateNodeClass(c_node, cls)
    return cls
461
462
463################################################################################
464# read-only tree based class lookup
465
cdef class PythonElementClassLookup(FallbackElementClassLookup):
    u"""PythonElementClassLookup(self, fallback=None)
    Element class lookup based on a subclass method.

    This class lookup scheme gives read-only access to the entire XML
    tree.  To use it, re-implement the ``lookup(self, doc, root)``
    method in a subclass::

        from lxml import etree, pyclasslookup

        class MyElementClass(etree.ElementBase):
            honkey = True

        class MyLookup(pyclasslookup.PythonElementClassLookup):
            def lookup(self, doc, root):
                if root.tag == "sometag":
                    return MyElementClass
                else:
                    for child in root:
                        if child.tag == "someothertag":
                            return MyElementClass
                # delegate to default
                return None

    Returning None from this method makes the fallback get called.

    The first argument is the opaque document instance that contains
    the Element.  The second argument is a lightweight Element proxy
    implementation that is only valid during the lookup.  Do not try
    to keep a reference to it.  The proxy becomes invalid as soon as
    the lookup is done.

    Also, such a read-only Element cannot be wrapped in an
    ElementTree, and you must take care not to keep a reference to
    them outside of the `lookup()` method.

    Note that the API of the Element objects is not complete.  It is
    purely read-only and does not support all features of the normal
    `lxml.etree` API (such as XPath, extended slicing or some
    iteration methods).

    See https://lxml.de/element_classes.html
    """
    def __cinit__(self):
        self._lookup_function = _python_class_lookup

    def lookup(self, doc, element):
        u"""lookup(self, doc, element)

        Override this method to implement your own lookup scheme.
        """
        # default implementation: no decision
        return None
518
cdef object _python_class_lookup(state, _Document doc, tree.xmlNode* c_node):
    # Lookup function of PythonElementClassLookup: hands a read-only proxy
    # of the node to the Python level lookup() method and validates the
    # class that it returns.
    cdef PythonElementClassLookup lookup
    cdef _ReadOnlyProxy proxy
    lookup = <PythonElementClassLookup>state

    proxy = _newReadOnlyProxy(None, c_node)
    try:
        cls = lookup.lookup(doc, proxy)
    finally:
        # Always release the proxies, also when the user code raises.
        # They are documented as valid during the lookup only.
        _freeReadOnlyProxies(proxy)

    if cls is not None:
        _validateNodeClass(c_node, cls)
        return cls
    return _callLookupFallback(lookup, doc, c_node)
532
533################################################################################
534# Global setup
535
# globally active lookup function and the state object that is passed to it
cdef _element_class_lookup_function LOOKUP_ELEMENT_CLASS
cdef object ELEMENT_CLASS_LOOKUP_STATE

cdef void _setElementClassLookupFunction(
    _element_class_lookup_function function, object state):
    # Install 'function' and 'state' as the global lookup.  A NULL function
    # selects the default lookup and ignores 'state'.
    global LOOKUP_ELEMENT_CLASS, ELEMENT_CLASS_LOOKUP_STATE
    if function is not NULL:
        ELEMENT_CLASS_LOOKUP_STATE = state
        LOOKUP_ELEMENT_CLASS = function
    else:
        ELEMENT_CLASS_LOOKUP_STATE = DEFAULT_ELEMENT_CLASS_LOOKUP
        LOOKUP_ELEMENT_CLASS = DEFAULT_ELEMENT_CLASS_LOOKUP._lookup_function
548
def set_element_class_lookup(ElementClassLookup lookup = None):
    u"""set_element_class_lookup(lookup = None)

    Set the global default element class lookup method.
    """
    # without a lookup, or with one that has no lookup function, the
    # default lookup gets installed
    if lookup is not None and lookup._lookup_function is not NULL:
        _setElementClassLookupFunction(lookup._lookup_function, lookup)
    else:
        _setElementClassLookupFunction(NULL, None)
558
# default setup: the global lookup delegates to the parser's class lookup
cdef ParserBasedElementClassLookup DEFAULT_ELEMENT_CLASS_LOOKUP
DEFAULT_ELEMENT_CLASS_LOOKUP = ParserBasedElementClassLookup()

# install the default as the initially active global lookup
set_element_class_lookup(DEFAULT_ELEMENT_CLASS_LOOKUP)
564