1from __future__ import absolute_import, division, unicode_literals
2
3from xml.dom import Node
4from ..constants import namespaces, voidElements, spaceCharacters
5
# Names exported by a star-import of this module.
__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
           "TreeWalker", "NonRecursiveTreeWalker"]

# Node-type constants. All but UNKNOWN are aliases for the corresponding
# ``xml.dom.Node`` node-type values.
DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE
# String marker for a node type that is none of the above;
# NonRecursiveTreeWalker.__iter__ reports any unrecognised type through
# TreeWalker.unknown().
UNKNOWN = "<#UNKNOWN#>"

# Join the imported space characters into a single string so it can be passed
# to str.lstrip()/str.rstrip() in TreeWalker.text().
spaceCharacters = "".join(spaceCharacters)
18
19
class TreeWalker(object):
    """Walks a tree yielding tokens

    Tokens are dicts that all have a ``type`` field specifying the type of the
    token.

    This base class only provides the token-building helpers; ``__iter__``
    must be supplied by a subclass.

    """
    def __init__(self, tree):
        """Creates a TreeWalker

        :arg tree: the tree to walk

        """
        self.tree = tree

    def __iter__(self):
        """Walks ``self.tree``; must be overridden by subclasses

        :raises NotImplementedError: always, in this base class

        """
        raise NotImplementedError

    def error(self, msg):
        """Generates an error token with the given message

        :arg msg: the error message

        :returns: SerializeError token

        """
        return {"type": "SerializeError", "data": msg}

    def emptyTag(self, namespace, name, attrs, hasChildren=False):
        """Generates an EmptyTag token, plus an error if it has children

        This is a generator: it always yields the EmptyTag token first and,
        when ``hasChildren`` is true, a SerializeError token after it.

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :arg attrs: the attributes of the element as a dict

        :arg hasChildren: whether or not to yield a SerializeError because
            this tag shouldn't have children

        :returns: generator of an EmptyTag token optionally followed by a
            SerializeError token

        """
        yield {"type": "EmptyTag", "name": name,
               "namespace": namespace,
               "data": attrs}
        if hasChildren:
            yield self.error("Void element has children")

    def startTag(self, namespace, name, attrs):
        """Generates a StartTag token

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :arg attrs: the attributes of the element as a dict

        :returns: StartTag token

        """
        return {"type": "StartTag",
                "name": name,
                "namespace": namespace,
                "data": attrs}

    def endTag(self, namespace, name):
        """Generates an EndTag token

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :returns: EndTag token

        """
        return {"type": "EndTag",
                "name": name,
                "namespace": namespace}

    def text(self, data):
        """Generates SpaceCharacters and Characters tokens

        Depending on what's in the data, this generates one or more
        ``SpaceCharacters`` and ``Characters`` tokens: leading space
        characters, then the remaining text with trailing space characters
        removed, then the trailing space characters. Empty pieces are
        skipped, and space characters inside the text stay in the
        ``Characters`` token.

        For example:

            >>> from html5lib.treewalkers.base import TreeWalker
            >>> # Give it an empty tree just so it instantiates
            >>> walker = TreeWalker([])
            >>> list(walker.text(''))
            []
            >>> list(walker.text('  '))
            [{u'data': '  ', u'type': u'SpaceCharacters'}]
            >>> list(walker.text(' abc '))  # doctest: +NORMALIZE_WHITESPACE
            [{u'data': ' ', u'type': u'SpaceCharacters'},
            {u'data': u'abc', u'type': u'Characters'},
            {u'data': u' ', u'type': u'SpaceCharacters'}]

        :arg data: the text data

        :returns: one or more ``SpaceCharacters`` and ``Characters`` tokens

        """
        # Whatever lstrip() removed is the leading run of space characters.
        middle = data.lstrip(spaceCharacters)
        left = data[:len(data) - len(middle)]
        if left:
            yield {"type": "SpaceCharacters", "data": left}
        # Split what remains into the text proper and its trailing spaces.
        data = middle
        middle = data.rstrip(spaceCharacters)
        right = data[len(middle):]
        if middle:
            yield {"type": "Characters", "data": middle}
        if right:
            yield {"type": "SpaceCharacters", "data": right}

    def comment(self, data):
        """Generates a Comment token

        :arg data: the comment

        :returns: Comment token

        """
        return {"type": "Comment", "data": data}

    def doctype(self, name, publicId=None, systemId=None):
        """Generates a Doctype token

        :arg name: stored under the token's ``name`` key

        :arg publicId: stored under the token's ``publicId`` key

        :arg systemId: stored under the token's ``systemId`` key

        :returns: the Doctype token

        """
        return {"type": "Doctype",
                "name": name,
                "publicId": publicId,
                "systemId": systemId}

    def entity(self, name):
        """Generates an Entity token

        :arg name: the entity name

        :returns: an Entity token

        """
        return {"type": "Entity", "name": name}

    def unknown(self, nodeType):
        """Generates a SerializeError token for an unrecognised node type

        :arg nodeType: the offending node type; it need not be a string
            (for example an integer node type), it is formatted with ``%s``

        :returns: SerializeError token

        """
        # Format instead of concatenating: ``"..." + nodeType`` raises
        # TypeError whenever nodeType is not a string.
        return self.error("Unknown node type: %s" % (nodeType,))
178
179
class NonRecursiveTreeWalker(TreeWalker):
    """Tree walker that traverses with an explicit loop instead of recursion

    Subclasses provide the four navigation hooks below; ``__iter__`` uses
    them to walk depth-first from ``self.tree`` and yield tokens.

    """
    def getNodeDetails(self, node):
        """Describes ``node``; must be overridden by subclasses

        ``__iter__`` expects a sequence whose first item is a node-type
        constant and whose remaining items depend on that type:

        * ``DOCTYPE``: the arguments for ``doctype()``
        * ``TEXT``: the arguments for ``text()``
        * ``ELEMENT``: ``namespace, name, attributes, hasChildren``
        * ``COMMENT``: the comment data
        * ``ENTITY``: the entity name
        * ``DOCUMENT``: nothing further is read
        * anything else: the value passed on to ``unknown()``

        """
        raise NotImplementedError

    def getFirstChild(self, node):
        """Returns the first child of ``node`` or ``None``; must be overridden"""
        raise NotImplementedError

    def getNextSibling(self, node):
        """Returns the next sibling of ``node`` or ``None``; must be overridden"""
        raise NotImplementedError

    def getParentNode(self, node):
        """Returns the parent of ``node``; must be overridden"""
        raise NotImplementedError

    def __iter__(self):
        """Yields the tokens for ``self.tree`` and everything below it"""
        node = self.tree
        while node is not None:
            info = self.getNodeDetails(node)
            kind, info = info[0], info[1:]
            descend = False

            if kind == DOCTYPE:
                yield self.doctype(*info)

            elif kind == TEXT:
                for tok in self.text(*info):
                    yield tok

            elif kind == ELEMENT:
                ns, tag, attrs, descend = info
                if (not ns or ns == namespaces["html"]) and tag in voidElements:
                    for tok in self.emptyTag(ns, tag, attrs, descend):
                        yield tok
                    # Children of a void element are reported as an error
                    # by emptyTag() and never visited.
                    descend = False
                else:
                    yield self.startTag(ns, tag, attrs)

            elif kind == COMMENT:
                yield self.comment(info[0])

            elif kind == ENTITY:
                yield self.entity(info[0])

            elif kind == DOCUMENT:
                descend = True

            else:
                yield self.unknown(info[0])

            child = self.getFirstChild(node) if descend else None
            if child is not None:
                node = child
                continue

            # Nothing below this node: close it, then climb until a next
            # sibling is found or the root of the walk has been closed.
            while node is not None:
                info = self.getNodeDetails(node)
                kind, info = info[0], info[1:]
                if kind == ELEMENT:
                    ns, tag, _attrs, _kids = info
                    # Void HTML elements were emitted as EmptyTag and get no
                    # end tag.
                    if (ns and ns != namespaces["html"]) or tag not in voidElements:
                        yield self.endTag(ns, tag)
                if node is self.tree:
                    # The root of the walk has been closed; we are done.
                    return
                sibling = self.getNextSibling(node)
                if sibling is not None:
                    node = sibling
                    break
                node = self.getParentNode(node)
253