1from __future__ import absolute_import, division, unicode_literals
2
3import re
4
5from . import base
6from ..constants import rcdataElements, spaceCharacters
# Rebind the imported collection of whitespace characters as a single string
# so it can be interpolated into a regular-expression character class below.
spaceCharacters = "".join(spaceCharacters)

# Matches one or more consecutive characters drawn from ``spaceCharacters``.
SPACES_REGEX = re.compile("[%s]+" % (spaceCharacters,))
10
11
class Filter(base.Filter):
    """Collapse whitespace in the token stream.

    Tokens found inside any element named in ``spacePreserveElements`` are
    yielded untouched.  Outside of those elements, a non-empty
    "SpaceCharacters" token has its data replaced by a single space and a
    "Characters" token has its data passed through ``collapse_spaces``.

    Token dicts are modified in place before being yielded.
    """

    # Elements whose content keeps its whitespace: "pre", "textarea" and every
    # name in ``rcdataElements`` (imported from ..constants).
    spacePreserveElements = frozenset(["pre", "textarea"]).union(rcdataElements)

    def __iter__(self):
        # Number of currently open elements at or below a space-preserving
        # element; zero means whitespace may be collapsed.
        depth = 0
        for token in base.Filter.__iter__(self):
            kind = token["type"]

            if kind == "StartTag":
                # Once inside a preserving element, every nested start tag is
                # counted so the matching end tags unwind correctly.
                if depth or token["name"] in self.spacePreserveElements:
                    depth += 1

            elif kind == "EndTag":
                if depth:
                    depth -= 1

            elif not depth:
                if kind == "SpaceCharacters":
                    # Only rewrite non-empty data so that no space is
                    # introduced where there was none.
                    if token["data"]:
                        token["data"] = " "
                elif kind == "Characters":
                    token["data"] = collapse_spaces(token["data"])

            yield token
35
36
def collapse_spaces(text):
    """Return ``text`` with every match of ``SPACES_REGEX`` replaced.

    Each run matched by the module-level ``SPACES_REGEX`` pattern is
    substituted with a single space character.
    """
    collapsed = SPACES_REGEX.sub(" ", text)
    return collapsed
39