# ebooks/readability/cleaners.py

# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
import re
from lxml.html.clean import Cleaner

# Attribute-name patterns (regex fragments, matched case-insensitively) that
# are stripped from tags.  The last entry targets inline event handlers such
# as onclick/onload; a glob-style 'on*' would be read by the regex engine as
# "o followed by zero or more n" and would never match them.
bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on[a-z]+']

# The three accepted spellings of an attribute value.  Quoted values must be
# non-empty to match.
single_quoted = "'[^']+'"
double_quoted = '"[^"]+"'
non_space = '[^ "\'>]+'

# Matches one whole tag that contains an undesirable attribute.  Group 1 is
# everything before the attribute (without the separating space) and group 2
# is everything after its value, so substituting r'<\1\2>' drops exactly that
# attribute and keeps the rest of the tag.
htmlstrip = re.compile(
    "<"            # open
    "([^>]+) "     # prefix: tag name plus any earlier attributes
    + "(?:%s) *" % ('|'.join(bad_attrs),)  # undesirable attribute name
    + '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted)  # value
    + "([^>]*)"    # postfix: whatever follows the value
    ">",           # end
    re.I)


def clean_attributes(html):
    """Return *html* with every attribute matched by ``htmlstrip`` removed.

    A single substitution pass drops at most one attribute per tag (the
    pattern consumes the whole tag), so passes are repeated until one of
    them makes no replacement.
    """
    while True:
        html, replaced = htmlstrip.subn(r'<\1\2>', html)
        if not replaced:
            return html


def normalize_spaces(s):
    """Replace any sequence of whitespace characters with a single space.

    Leading and trailing whitespace is dropped as well.  Any falsy input
    (``None``, ``''``) yields ``''``.
    """
    # The docstring must be the first statement to be picked up as __doc__.
    if not s:
        return ''
    return ' '.join(s.split())


# Module-wide lxml Cleaner instance; see the lxml Cleaner documentation for
# the exact effect of each switch.
html_cleaner = Cleaner(
    # Cleaning steps that are switched on.
    scripts=True,
    javascript=True,
    comments=True,
    style=True,
    links=True,
    processing_instructions=True,
    # Cleaning steps that are switched off.
    meta=False,
    add_nofollow=False,
    page_structure=False,
    embedded=False,
    frames=False,
    forms=False,
    annoying_tags=False,
    remove_unknown_tags=False,
    safe_attrs_only=False,
    # No extra tags are listed for removal.
    remove_tags=None,
)