1# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
2import re
3from lxml.html.clean import Cleaner
4
# Regex fragments (matched case-insensitively) naming the attributes to strip.
# 'on[a-z]+' covers event handlers such as onclick/onload; the previous
# glob-style 'on*' only matched the literal names "o", "on", "onn", ...
bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on[a-z]+']

# The three spellings an attribute value may take. The quoted forms accept an
# empty value (e.g. width="") so that empty nuisance attributes are removed too.
single_quoted = "'[^']*'"
double_quoted = '"[^"]*"'
non_space = '[^ "\'>]+'

# Matches one opening tag containing a nuisance attribute:
#   <  (prefix)  SPACE  bad-attr  =  value  (postfix)  >
# Group 1 is everything before the attribute (tag name and earlier attributes),
# group 2 is everything after its value, so substituting r'<\1\2>' drops exactly
# that attribute. The prefix is greedy, so a single pass removes only one
# attribute per tag; callers repeat the substitution until nothing matches.
htmlstrip = re.compile(
    r"<([^>]+) (?:%s) *= *(?:%s|%s|%s)([^>]*)>"
    % ('|'.join(bad_attrs), non_space, single_quoted, double_quoted),
    re.I,
)
16
17
def clean_attributes(html):
    """Return ``html`` with every attribute matched by ``htmlstrip`` removed.

    One substitution pass can leave further matches behind in the same tag,
    so the substitution is repeated until a pass replaces nothing.
    """
    replaced = 1
    while replaced:
        # subn reports how many tags were rewritten; zero means we are done.
        html, replaced = htmlstrip.subn(r'<\1\2>', html)
    return html
22
23
def normalize_spaces(s):
    """Replace any sequence of whitespace characters with a single space.

    Leading and trailing whitespace is dropped as well. Any falsy input
    (``None``, the empty string) yields ``''``.
    """
    if not s:
        return ''
    # str.split() with no argument splits on runs of whitespace and discards
    # empty pieces, so re-joining with ' ' collapses and trims in one step.
    return ' '.join(s.split())
30
31
# Shared lxml Cleaner instance. Options set to True are the clean-up steps
# that are switched on; everything else is explicitly disabled.
# NOTE(review): see the lxml Cleaner documentation for the exact semantics of
# each flag.
html_cleaner = Cleaner(
    # enabled
    scripts=True,
    javascript=True,
    comments=True,
    style=True,
    links=True,
    processing_instructions=True,
    # disabled
    meta=False,
    add_nofollow=False,
    page_structure=False,
    embedded=False,
    frames=False,
    forms=False,
    annoying_tags=False,
    remove_unknown_tags=False,
    safe_attrs_only=False,
    # no extra tags to drop
    remove_tags=None,
)
37