import re

from lxml.html import tostring
import lxml.html

from calibre.ebooks.readability.cleaners import normalize_spaces, clean_attributes
from calibre.ebooks.chardet import xml_to_unicode
from polyglot.builtins import iteritems


def build_doc(page):
    """Parse raw HTML (bytes or text) into an lxml document.

    Encoding declarations are stripped during the unicode conversion so
    lxml does not trip over a stale <?xml?>/meta-charset prolog once the
    text is already decoded.
    """
    page_unicode = xml_to_unicode(page, strip_encoding_pats=True)[0]
    return lxml.html.document_fromstring(page_unicode)


def js_re(src, pattern, flags, repl):
    """Apply a JavaScript-style regex replacement to *src*.

    ``repl`` may use JS '$1' group references; they are converted to
    Python's '\\1' form before substitution.
    """
    # BUG FIX: Pattern.sub() takes (replacement, string) — the original
    # call had the arguments swapped, substituting *src* into *repl*.
    return re.compile(pattern, flags).sub(repl.replace('$', '\\'), src)


def normalize_entities(cur_title):
    """Replace typographic dashes, spaces and quotes with plain ASCII."""
    # NOTE: the original table listed the em/en dashes twice (identical
    # key characters spelled both literally and as escapes) and mapped
    # '"' to itself; the duplicates and the identity entry are dropped —
    # effective behavior is unchanged.
    entities = {
        '\u2014': '-',   # em dash
        '\u2013': '-',   # en dash
        '\u00a0': ' ',   # non-breaking space
        '\u00ab': '"',   # left-pointing guillemet
        '\u00bb': '"',   # right-pointing guillemet
    }
    for char, replacement in iteritems(entities):
        if char in cur_title:
            cur_title = cur_title.replace(char, replacement)
    return cur_title


def norm_title(title):
    """Return *title* with whitespace and typography normalized."""
    return normalize_entities(normalize_spaces(title))


def get_title(doc):
    """Return the document's normalized <title> text, or '[no-title]'."""
    try:
        title = doc.find('.//title').text
    except AttributeError:
        # No <title> element at all — find() returned None.
        title = None
    if not title:
        return '[no-title]'
    return norm_title(title)


def add_match(collection, text, orig):
    """Add *text* to *collection* if it looks like a sub-title of *orig*.

    A candidate must have at least two words, be 15+ characters long and
    occur (ignoring double quotes) inside the original title text.
    """
    text = norm_title(text)
    if len(text.split()) >= 2 and len(text) >= 15:
        if text.replace('"', '') in orig.replace('"', ''):
            collection.add(text)


def shorten_title(doc):
    """Heuristically shorten a page title down to the article headline.

    First looks for heading elements (h1-h3, or elements with common
    title/heading ids and classes) whose text is contained in <title>;
    failing that, splits the title on common site-name delimiters and
    keeps the part that looks like a sentence. Returns '' when there is
    no usable title, and the full normalized title when the shortened
    form looks degenerate (not between 16 and 149 characters).
    """
    title_elem = doc.find('.//title')
    # Robustness fix: a document without a <title> element used to raise
    # AttributeError here; get_title() already guards this case.
    if title_elem is None or not title_elem.text:
        return ''

    title = orig = norm_title(title_elem.text)

    candidates = set()

    # Headline tags whose text matches part of the title are the best bet.
    for tag in ('.//h1', './/h2', './/h3'):
        for e in list(doc.iterfind(tag)):
            if e.text:
                add_match(candidates, e.text, orig)
            if e.text_content():
                add_match(candidates, e.text_content(), orig)

    # Common id/class names sites use for the article heading element.
    for query in [
        "descendant-or-self::*[@id = 'title']",
        "descendant-or-self::*[@id = 'head']",
        "descendant-or-self::*[@id = 'heading']",
        "descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' pageTitle ')]",
        "descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' news_title ')]",
        "descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' title ')]",
        "descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' head ')]",
        "descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' heading ')]",
        "descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' contentheading ')]",
        "descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' small_header_red ')]",
    ]:
        for e in doc.xpath(query):
            if e.text:
                add_match(candidates, e.text, orig)
            if e.text_content():
                add_match(candidates, e.text_content(), orig)

    if candidates:
        # The longest matching heading wins.
        title = sorted(candidates, key=len)[-1]
    else:
        # Strip a site name separated by a common delimiter, keeping the
        # side that looks like a real sentence (4+ words).
        for delimiter in [' | ', ' - ', ' :: ', ' / ']:
            if delimiter in title:
                parts = orig.split(delimiter)
                if len(parts[0].split()) >= 4:
                    title = parts[0]
                    break
                elif len(parts[-1].split()) >= 4:
                    title = parts[-1]
                    break
        else:
            # No delimiter produced a break: fall back to ': ' splitting.
            if ': ' in title:
                parts = orig.split(': ')
                if len(parts[-1].split()) >= 4:
                    title = parts[-1]
                else:
                    title = orig.split(': ', 1)[1]

    if not 15 < len(title) < 150:
        # Too short/long to be a headline; keep the full title.
        return orig

    return title


def get_body(doc):
    """Return the cleaned, serialized markup of <body> (or the whole doc).

    Script, stylesheet-link and style elements are removed from the tree
    (in place) before serialization; attribute noise is stripped by
    clean_attributes().
    """
    for elem in doc.xpath('.//script | .//link | .//style'):
        elem.drop_tree()
    # BUG FIX: lxml's tostring() returns bytes by default, so the old
    # str(tostring(...)) produced a "b'...'" literal under Python 3.
    # Request a unicode string directly instead.
    raw_html = tostring(doc.body or doc, encoding='unicode')
    return clean_attributes(raw_html)