1from __future__ import absolute_import, division, unicode_literals 2 3import re 4 5from . import base 6from ..constants import rcdataElements, spaceCharacters 7spaceCharacters = "".join(spaceCharacters) 8 9SPACES_REGEX = re.compile("[%s]+" % spaceCharacters) 10 11 12class Filter(base.Filter): 13 """Collapses whitespace except in pre, textarea, and script elements""" 14 spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements)) 15 16 def __iter__(self): 17 preserve = 0 18 for token in base.Filter.__iter__(self): 19 type = token["type"] 20 if type == "StartTag" \ 21 and (preserve or token["name"] in self.spacePreserveElements): 22 preserve += 1 23 24 elif type == "EndTag" and preserve: 25 preserve -= 1 26 27 elif not preserve and type == "SpaceCharacters" and token["data"]: 28 # Test on token["data"] above to not introduce spaces where there were not 29 token["data"] = " " 30 31 elif not preserve and type == "Characters": 32 token["data"] = collapse_spaces(token["data"]) 33 34 yield token 35 36 37def collapse_spaces(text): 38 return SPACES_REGEX.sub(' ', text) 39