1"""A collection of modules for iterating through different kinds of
2tree, generating tokens identical to those produced by the tokenizer
3module.
4
5To create a tree walker for a new type of tree, you need to
6implement a tree walker object (called TreeWalker by convention) that
7implements a 'serialize' method which takes a tree as sole argument and
8returns an iterator which generates tokens.
9"""
10
11from __future__ import absolute_import, division, unicode_literals
12
13from .. import constants
14from .._utils import default_etree
15
# Names exported by ``from <package> import *``.
__all__ = ["getTreeWalker", "pprint"]

# Maps a lower-cased tree type name to its TreeWalker class. Filled lazily by
# getTreeWalker() for the "dom", "genshi" and "lxml" tree types; "etree"
# walkers are deliberately not stored here.
treeWalkerCache = {}
19
20
def getTreeWalker(treeType, implementation=None, **kwargs):
    """Look up the TreeWalker class for one of the built-in tree types

    :arg str treeType: name of the tree type wanted; matching ignores case.
        Recognised names:

        * "dom": The xml.dom.minidom DOM implementation
        * "etree": A generic walker for tree implementations exposing an
          elementtree-like interface (known to work with ElementTree,
          cElementTree and lxml.etree).
        * "lxml": Optimized walker for lxml.etree
        * "genshi": a Genshi stream

    :arg implementation: module providing the tree type, e.g.
        xml.etree.ElementTree or cElementTree. Only consulted for the
        "etree" tree type; when omitted, ``default_etree`` is used.

    :arg kwargs: extra keyword arguments forwarded to the etree walker
        factory; ignored for every other tree type

    :returns: a TreeWalker class, or ``None`` when ``treeType`` is not one of
        the recognised names

    """
    kind = treeType.lower()

    # Fast path: this tree type has been resolved before.
    if kind in treeWalkerCache:
        return treeWalkerCache.get(kind)

    if kind == "etree":
        from . import etree
        etreeImpl = default_etree if implementation is None else implementation
        # XXX: NEVER cache here, caching is done in the etree submodule
        return etree.getETreeModule(etreeImpl, **kwargs).TreeWalker

    # Import the backing submodule only when it is first asked for.
    if kind == "dom":
        from . import dom as walkerModule
    elif kind == "genshi":
        from . import genshi as walkerModule
    elif kind == "lxml":
        from . import etree_lxml as walkerModule
    else:
        walkerModule = None

    if walkerModule is not None:
        treeWalkerCache[kind] = walkerModule.TreeWalker
    # Unknown tree types were never stored, so this yields None for them.
    return treeWalkerCache.get(kind)
63
64
def concatenateCharacterTokens(tokens):
    """Merge runs of adjacent text tokens into single "Characters" tokens

    Consecutive "Characters" and "SpaceCharacters" tokens are collapsed into
    one new "Characters" token whose data is the concatenation of theirs.
    Every other token is passed through unchanged, in its original position.

    :arg tokens: an iterable of token dicts, each with a "type" key (text
        tokens additionally need a "data" key)

    :returns: a generator of token dicts

    """
    textTypes = ("Characters", "SpaceCharacters")
    buffered = []
    for tok in tokens:
        if tok["type"] in textTypes:
            buffered.append(tok["data"])
            continue
        # A non-text token ends the current run: emit the merged text first.
        if buffered:
            yield {"type": "Characters", "data": "".join(buffered)}
            buffered = []
        yield tok
    # Emit whatever text was still pending when the input ran out.
    if buffered:
        yield {"type": "Characters", "data": "".join(buffered)}
78
79
def pprint(walker):
    """Pretty printer for tree walkers

    Takes a TreeWalker instance and pretty prints the output of walking the tree.

    :arg walker: a TreeWalker instance

    :returns: the rendered tree as a single string, one line per element,
        attribute, comment, doctype or text run; the contents and attributes
        of an element are indented two spaces deeper than the element itself

    :raises ValueError: if the walker yields a token whose type is not known

    """
    def qualifiedName(namespace, localname):
        # Render "prefix localname"; when no prefix is registered for the
        # namespace, the raw namespace value is used in its place.
        if namespace in constants.prefixes:
            prefix = constants.prefixes[namespace]
        else:
            prefix = namespace
        return "%s %s" % (prefix, localname)

    def attributeOrder(item):
        # Attribute keys are (namespace, localname) pairs and the namespace
        # may be None. Python 3 cannot order None against str, so a missing
        # namespace is sorted as the empty string; this keeps un-namespaced
        # attributes first and leaves the order unchanged for homogeneous keys.
        (namespace, localname), _ = item
        return (namespace or "", localname)

    output = []
    indent = 0
    for token in concatenateCharacterTokens(walker):
        tokenType = token["type"]
        if tokenType in ("StartTag", "EmptyTag"):
            # tag name (the HTML namespace is implied and not printed)
            namespace = token["namespace"]
            if namespace and namespace != constants.namespaces["html"]:
                name = qualifiedName(namespace, token["name"])
            else:
                name = token["name"]
            output.append("%s<%s>" % (" " * indent, name))
            indent += 2
            # attributes (sorted for consistent ordering)
            attrs = token["data"]
            for (namespace, localname), value in sorted(attrs.items(),
                                                        key=attributeOrder):
                if namespace:
                    name = qualifiedName(namespace, localname)
                else:
                    name = localname
                output.append("%s%s=\"%s\"" % (" " * indent, name, value))
            # self-closing: there will be no matching EndTag to undo the indent
            if tokenType == "EmptyTag":
                indent -= 2

        elif tokenType == "EndTag":
            indent -= 2

        elif tokenType == "Comment":
            output.append("%s<!-- %s -->" % (" " * indent, token["data"]))

        elif tokenType == "Doctype":
            if token["name"]:
                if token["publicId"]:
                    output.append("""%s<!DOCTYPE %s "%s" "%s">""" %
                                  (" " * indent,
                                   token["name"],
                                   token["publicId"],
                                   token["systemId"] if token["systemId"] else ""))
                elif token["systemId"]:
                    output.append("""%s<!DOCTYPE %s "" "%s">""" %
                                  (" " * indent,
                                   token["name"],
                                   token["systemId"]))
                else:
                    output.append("%s<!DOCTYPE %s>" % (" " * indent,
                                                       token["name"]))
            else:
                output.append("%s<!DOCTYPE >" % (" " * indent,))

        elif tokenType == "Characters":
            output.append("%s\"%s\"" % (" " * indent, token["data"]))

        elif tokenType == "SpaceCharacters":
            # Internal invariant: whitespace runs were merged into Characters.
            assert False, "concatenateCharacterTokens should have got rid of all Space tokens"

        else:
            raise ValueError("Unknown token type, %s" % tokenType)

    return "\n".join(output)
155