# Strip out a set of nuisance HTML attributes that can mess up rendering in
# RSS feeds.
import re

from lxml.html.clean import Cleaner

# Regex fragments naming the attributes to drop (matched case-insensitively).
# The event-handler entry is written 'on[a-z]*' (onclick, onload, ...): as a
# regex, 'on*' would only match "o", "on", "onn", ... and never a real
# handler attribute name.
bad_attrs = [
    'width',
    'height',
    'style',
    '[-a-z]*color',
    'background[-a-z]*',
    'on[a-z]*',
]

# Accepted spellings of an attribute value. Each requires at least one
# character, so an empty value such as style="" is not matched.
single_quoted = "'[^']+'"
double_quoted = '"[^"]+"'
non_space = '[^ "\'>]+'

# Matches one whole tag containing one unwanted attribute. Group 1 is the tag
# text before the attribute, group 2 the tag text after its value.
htmlstrip = re.compile(
    "<"                   # open
    "([^>]+) "            # prefix
    "(?:%s) *"            # undesirable attribute name
    "= *(?:%s|%s|%s)"     # value
    "([^>]*)"             # postfix
    ">"                   # end
    % ('|'.join(bad_attrs), non_space, single_quoted, double_quoted),
    re.I,
)


def clean_attributes(html):
    """Return *html* with every attribute listed in ``bad_attrs`` removed.

    A single substitution pass removes at most one attribute per tag,
    because each match consumes the tag from ``<`` to ``>``. The
    substitution is therefore repeated until a pass changes nothing.
    """
    while True:
        # subn reports how many tags were rewritten, so no separate
        # search() over the same text is needed to decide when to stop.
        html, replaced = htmlstrip.subn(r'<\1\2>', html)
        if not replaced:
            return html


def normalize_spaces(s):
    """Replace any sequence of whitespace characters with a single space.

    Leading and trailing whitespace is dropped as well. Falsy input
    (``None`` or an empty string) yields ``''``.
    """
    if not s:
        return ''
    return ' '.join(s.split())


# Shared lxml Cleaner instance. Enabled: scripts, javascript, comments, style,
# links and processing_instructions. Every other option is switched off
# explicitly; see the lxml Cleaner documentation for each option's effect.
html_cleaner = Cleaner(
    scripts=True,
    javascript=True,
    comments=True,
    style=True,
    links=True,
    meta=False,
    add_nofollow=False,
    page_structure=False,
    processing_instructions=True,
    embedded=False,
    frames=False,
    forms=False,
    annoying_tags=False,
    remove_tags=None,
    remove_unknown_tags=False,
    safe_attrs_only=False,
)