from __future__ import absolute_import, division, unicode_literals

from xml.dom import Node
from ..constants import namespaces, voidElements, spaceCharacters

__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
           "TreeWalker", "NonRecursiveTreeWalker"]

# Node-type tags compared against the first item returned by
# ``NonRecursiveTreeWalker.getNodeDetails``.
DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE
UNKNOWN = "<#UNKNOWN#>"

# Joined into a single string so it can be passed to str.lstrip/str.rstrip.
spaceCharacters = "".join(spaceCharacters)


class TreeWalker(object):
    """Walks a tree yielding tokens

    Tokens are dicts that all have a ``type`` field specifying the type of the
    token.

    """
    def __init__(self, tree):
        """Creates a TreeWalker

        :arg tree: the tree to walk

        """
        self.tree = tree

    def __iter__(self):
        """Iterates over the tokens of the tree; subclasses must override"""
        raise NotImplementedError

    def error(self, msg):
        """Generates an error token with the given message

        :arg msg: the error message

        :returns: SerializeError token

        """
        return {"type": "SerializeError", "data": msg}

    def emptyTag(self, namespace, name, attrs, hasChildren=False):
        """Generates an EmptyTag token

        This is a generator: it always yields the EmptyTag token and, when
        ``hasChildren`` is true, a SerializeError token after it.

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :arg attrs: the attributes of the element as a dict

        :arg hasChildren: whether or not to yield a SerializeError because
            this tag shouldn't have children

        :returns: generator of an EmptyTag token, optionally followed by a
            SerializeError token

        """
        yield {"type": "EmptyTag", "name": name,
               "namespace": namespace,
               "data": attrs}
        if hasChildren:
            yield self.error("Void element has children")

    def startTag(self, namespace, name, attrs):
        """Generates a StartTag token

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :arg attrs: the attributes of the element as a dict

        :returns: StartTag token

        """
        return {"type": "StartTag",
                "name": name,
                "namespace": namespace,
                "data": attrs}

    def endTag(self, namespace, name):
        """Generates an EndTag token

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :returns: EndTag token

        """
        return {"type": "EndTag",
                "name": name,
                "namespace": namespace}

    def text(self, data):
        """Generates SpaceCharacters and Characters tokens

        Depending on what's in the data, this generates one or more
        ``SpaceCharacters`` and ``Characters`` tokens.

        For example:

            >>> from html5lib.treewalkers.base import TreeWalker
            >>> # Give it an empty tree just so it instantiates
            >>> walker = TreeWalker([])
            >>> list(walker.text(''))
            []
            >>> list(walker.text('  '))
            [{u'data': '  ', u'type': u'SpaceCharacters'}]
            >>> list(walker.text(' abc '))  # doctest: +NORMALIZE_WHITESPACE
            [{u'data': ' ', u'type': u'SpaceCharacters'},
            {u'data': u'abc', u'type': u'Characters'},
            {u'data': u' ', u'type': u'SpaceCharacters'}]

        :arg data: the text data

        :returns: one or more ``SpaceCharacters`` and ``Characters`` tokens

        """
        # Peel the leading whitespace run off first ...
        middle = data.lstrip(spaceCharacters)
        left = data[:len(data) - len(middle)]
        if left:
            yield {"type": "SpaceCharacters", "data": left}
        # ... then split what remains into the core text and trailing
        # whitespace. Whitespace inside the core text stays in "Characters".
        data = middle
        middle = data.rstrip(spaceCharacters)
        right = data[len(middle):]
        if middle:
            yield {"type": "Characters", "data": middle}
        if right:
            yield {"type": "SpaceCharacters", "data": right}

    def comment(self, data):
        """Generates a Comment token

        :arg data: the comment

        :returns: Comment token

        """
        return {"type": "Comment", "data": data}

    def doctype(self, name, publicId=None, systemId=None):
        """Generates a Doctype token

        :arg name: the doctype name, stored under the ``name`` key

        :arg publicId: stored under the ``publicId`` key--defaults to ``None``

        :arg systemId: stored under the ``systemId`` key--defaults to ``None``

        :returns: the Doctype token

        """
        return {"type": "Doctype",
                "name": name,
                "publicId": publicId,
                "systemId": systemId}

    def entity(self, name):
        """Generates an Entity token

        :arg name: the entity name

        :returns: an Entity token

        """
        return {"type": "Entity", "name": name}

    def unknown(self, nodeType):
        """Handles unknown node types

        :arg nodeType: the unrecognised node type; may be any object (for
            example an integer node-type code), not only a string

        :returns: SerializeError token naming the node type

        """
        # %-formatting instead of "+" so a non-string nodeType (e.g. an int)
        # produces an error token rather than raising TypeError. The tuple
        # guards against nodeType itself being a tuple.
        return self.error("Unknown node type: %s" % (nodeType,))


class NonRecursiveTreeWalker(TreeWalker):
    """Tree walker that traverses the tree iteratively rather than recursively

    Subclasses supply the four navigation hooks below; ``__iter__`` uses them
    to walk the tree in document order and emit tokens.

    """
    def getNodeDetails(self, node):
        """Describes a node; subclasses must override

        ``__iter__`` expects a sequence whose first item is one of the
        module-level node-type constants and whose remaining items depend on
        that type:

        * ``DOCTYPE``: the arguments for ``doctype``
        * ``TEXT``: the arguments for ``text``
        * ``ELEMENT``: ``namespace, name, attributes, hasChildren``
        * ``COMMENT``: the comment data
        * ``ENTITY``: the entity name
        * ``DOCUMENT``: remaining items are not used
        * anything else: one item, which is passed to ``unknown``

        :arg node: the node to describe

        """
        raise NotImplementedError

    def getFirstChild(self, node):
        """Returns the first child of ``node`` or ``None``; subclasses must override"""
        raise NotImplementedError

    def getNextSibling(self, node):
        """Returns the next sibling of ``node`` or ``None``; subclasses must override"""
        raise NotImplementedError

    def getParentNode(self, node):
        """Returns the parent of ``node``; subclasses must override"""
        raise NotImplementedError

    def __iter__(self):
        """Yields the tokens for ``self.tree`` in document order"""
        currentNode = self.tree
        while currentNode is not None:
            details = self.getNodeDetails(currentNode)
            nodeType, details = details[0], details[1:]
            hasChildren = False

            if nodeType == DOCTYPE:
                yield self.doctype(*details)

            elif nodeType == TEXT:
                for token in self.text(*details):
                    yield token

            elif nodeType == ELEMENT:
                namespace, name, attributes, hasChildren = details
                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
                    for token in self.emptyTag(namespace, name, attributes,
                                               hasChildren):
                        yield token
                    # Never descend into a void element, even if the tree
                    # claims it has children (emptyTag already reported it).
                    hasChildren = False
                else:
                    yield self.startTag(namespace, name, attributes)

            elif nodeType == COMMENT:
                yield self.comment(details[0])

            elif nodeType == ENTITY:
                yield self.entity(details[0])

            elif nodeType == DOCUMENT:
                hasChildren = True

            else:
                yield self.unknown(details[0])

            if hasChildren:
                firstChild = self.getFirstChild(currentNode)
            else:
                firstChild = None

            if firstChild is not None:
                currentNode = firstChild
            else:
                # No children to visit: close this node, then climb towards
                # the root closing ancestors until one has a next sibling.
                while currentNode is not None:
                    details = self.getNodeDetails(currentNode)
                    nodeType, details = details[0], details[1:]
                    if nodeType == ELEMENT:
                        namespace, name, attributes, hasChildren = details
                        # Exact negation of the void-element test above:
                        # only elements that got a StartTag get an EndTag.
                        if (namespace and namespace != namespaces["html"]) or name not in voidElements:
                            yield self.endTag(namespace, name)
                    if self.tree is currentNode:
                        # Back at the root we started from: the walk is done.
                        currentNode = None
                        break
                    nextSibling = self.getNextSibling(currentNode)
                    if nextSibling is not None:
                        currentNode = nextSibling
                        break
                    else:
                        currentNode = self.getParentNode(currentNode)