"""A collection of modules for iterating through different kinds of
tree, generating tokens identical to those produced by the tokenizer
module.

To create a tree walker for a new type of tree, you need to
implement a tree walker object (called TreeWalker by convention) that
implements a 'serialize' method which takes a tree as sole argument and
returns an iterator which generates tokens.
"""

from __future__ import absolute_import, division, unicode_literals

from .. import constants
from .._utils import default_etree

__all__ = ["getTreeWalker", "pprint"]

# Lower-cased tree type name -> TreeWalker class. Filled lazily by
# getTreeWalker for the "dom", "genshi" and "lxml" tree types; "etree"
# is deliberately never stored here (see getTreeWalker).
treeWalkerCache = {}


def getTreeWalker(treeType, implementation=None, **kwargs):
    """Get a TreeWalker class for various types of tree with built-in support

    :arg str treeType: the name of the tree type required (case-insensitive).
        Supported values are:

        * "dom": The xml.dom.minidom DOM implementation
        * "etree": A generic walker for tree implementations exposing an
          elementtree-like interface (known to work with ElementTree,
          cElementTree and lxml.etree).
        * "lxml": Optimized walker for lxml.etree
        * "genshi": a Genshi stream

    :arg implementation: A module implementing the tree type e.g.
        xml.etree.ElementTree or cElementTree (Currently applies to the "etree"
        tree type only).

    :arg kwargs: keyword arguments passed to the etree walker--for other
        walkers, this has no effect

    :returns: a TreeWalker class, or ``None`` if ``treeType`` is not one of
        the supported values

    """

    treeType = treeType.lower()
    if treeType not in treeWalkerCache:
        # The walker modules are imported on first use only, so a backend
        # is not imported until it is actually requested.
        if treeType == "dom":
            from . import dom
            treeWalkerCache[treeType] = dom.TreeWalker
        elif treeType == "genshi":
            from . import genshi
            treeWalkerCache[treeType] = genshi.TreeWalker
        elif treeType == "lxml":
            from . import etree_lxml
            treeWalkerCache[treeType] = etree_lxml.TreeWalker
        elif treeType == "etree":
            from . import etree
            if implementation is None:
                implementation = default_etree
            # XXX: NEVER cache here, caching is done in the etree submodule
            return etree.getETreeModule(implementation, **kwargs).TreeWalker
    # Unknown tree types fall through to here and yield None.
    return treeWalkerCache.get(treeType)


def concatenateCharacterTokens(tokens):
    """Merge runs of adjacent character tokens into single tokens

    Consecutive "Characters" and "SpaceCharacters" tokens are replaced by one
    "Characters" token whose data is the concatenation of their data. All
    other tokens are passed through unchanged, in their original order.

    :arg tokens: an iterable of token dicts, each with a "type" key

    :returns: a generator of token dicts

    """
    pendingCharacters = []
    for token in tokens:
        if token["type"] in ("Characters", "SpaceCharacters"):
            pendingCharacters.append(token["data"])
            continue
        # A non-character token ends the current run: flush it first.
        if pendingCharacters:
            yield {"type": "Characters", "data": "".join(pendingCharacters)}
            pendingCharacters = []
        yield token
    # Flush a run that reaches the end of the stream.
    if pendingCharacters:
        yield {"type": "Characters", "data": "".join(pendingCharacters)}


def _prefixedName(namespace, localname):
    """Return "<prefix> <localname>" for a namespaced name

    The short prefix from ``constants.prefixes`` is used when the namespace
    has one; otherwise the namespace itself is used as the prefix.
    """
    if namespace in constants.prefixes:
        ns = constants.prefixes[namespace]
    else:
        ns = namespace
    return "%s %s" % (ns, localname)


def _attributeSortKey(item):
    """Sort key for ``((namespace, localname), value)`` attribute items

    A falsy namespace (e.g. ``None``) is mapped to the empty string so that
    namespaced and un-namespaced attributes can be ordered together; Python 3
    refuses to compare ``None`` with ``str``. Un-namespaced attributes still
    sort first.
    """
    namespace, localname = item[0]
    return (namespace or "", localname)


def pprint(walker):
    """Pretty printer for tree walkers

    Takes a TreeWalker instance and pretty prints the output of walking the tree.

    :arg walker: a TreeWalker instance

    :returns: the pretty-printed tree as a single string, one entry per line

    :raises ValueError: if the walker produces a token of unknown type

    """
    output = []
    indent = 0
    for token in concatenateCharacterTokens(walker):
        tokenType = token["type"]
        if tokenType in ("StartTag", "EmptyTag"):
            # tag name (elements in the HTML namespace are shown unprefixed)
            namespace = token["namespace"]
            if namespace and namespace != constants.namespaces["html"]:
                name = _prefixedName(namespace, token["name"])
            else:
                name = token["name"]
            output.append("%s<%s>" % (" " * indent, name))
            indent += 2
            # attributes (sorted for consistent ordering)
            attrs = token["data"]
            for (namespace, localname), value in sorted(attrs.items(),
                                                        key=_attributeSortKey):
                if namespace:
                    name = _prefixedName(namespace, localname)
                else:
                    name = localname
                output.append("%s%s=\"%s\"" % (" " * indent, name, value))
            # self-closing
            if tokenType == "EmptyTag":
                indent -= 2

        elif tokenType == "EndTag":
            indent -= 2

        elif tokenType == "Comment":
            output.append("%s<!-- %s -->" % (" " * indent, token["data"]))

        elif tokenType == "Doctype":
            if token["name"]:
                if token["publicId"]:
                    output.append("""%s<!DOCTYPE %s "%s" "%s">""" %
                                  (" " * indent,
                                   token["name"],
                                   token["publicId"],
                                   token["systemId"] if token["systemId"] else ""))
                elif token["systemId"]:
                    output.append("""%s<!DOCTYPE %s "" "%s">""" %
                                  (" " * indent,
                                   token["name"],
                                   token["systemId"]))
                else:
                    output.append("%s<!DOCTYPE %s>" % (" " * indent,
                                                       token["name"]))
            else:
                output.append("%s<!DOCTYPE >" % (" " * indent,))

        elif tokenType == "Characters":
            output.append("%s\"%s\"" % (" " * indent, token["data"]))

        elif tokenType == "SpaceCharacters":
            # Internal invariant: the stream was already passed through
            # concatenateCharacterTokens above.
            assert False, "concatenateCharacterTokens should have got rid of all Space tokens"

        else:
            raise ValueError("Unknown token type, %s" % tokenType)

    return "\n".join(output)