import re

from lxml.html import tostring
import lxml.html

from calibre.ebooks.readability.cleaners import normalize_spaces, clean_attributes
from calibre.ebooks.chardet import xml_to_unicode
from polyglot.builtins import iteritems


def build_doc(page):
    # Decode the raw page to unicode and parse it into an lxml HTML document.
    page_unicode = xml_to_unicode(page, strip_encoding_pats=True)[0]
    doc = lxml.html.document_fromstring(page_unicode)
    return doc


def js_re(src, pattern, flags, repl):
    # JavaScript-style regex replace: convert '$' group references in the
    # replacement to Python's backslash syntax, then substitute in src.
    return re.compile(pattern, flags).sub(repl.replace('$', '\\'), src)


def normalize_entities(cur_title):
    # Replace common dash, space and quote characters (and their HTML entity
    # forms) with plain ASCII equivalents.
    entities = {
        '\u2014': '-',
        '\u2013': '-',
        '&mdash;': '-',
        '&ndash;': '-',
        '\u00A0': ' ',
        '\u00AB': '"',
        '\u00BB': '"',
        '&quot;': '"',
    }
    for c, r in iteritems(entities):
        if c in cur_title:
            cur_title = cur_title.replace(c, r)

    return cur_title


def norm_title(title):
    return normalize_entities(normalize_spaces(title))


def get_title(doc):
    try:
        title = doc.find('.//title').text
    except AttributeError:
        title = None
    if not title:
        return '[no-title]'

    return norm_title(title)


def add_match(collection, text, orig):
    # Keep a candidate only if it is a reasonably long phrase (at least two
    # words and 15 characters) that occurs inside the original title.
    text = norm_title(text)
    if len(text.split()) >= 2 and len(text) >= 15:
        if text.replace('"', '') in orig.replace('"', ''):
            collection.add(text)


def shorten_title(doc):
    # Try to find a shorter, more specific title by matching the <title> text
    # against headings and elements with common title/heading ids and classes,
    # falling back to splitting on site-name delimiters.
    try:
        title = doc.find('.//title').text
    except AttributeError:
        title = None
    if not title:
        return ''

    title = orig = norm_title(title)

    candidates = set()

    for item in ['.//h1', './/h2', './/h3']:
        for e in list(doc.iterfind(item)):
            if e.text:
                add_match(candidates, e.text, orig)
            if e.text_content():
                add_match(candidates, e.text_content(), orig)

    for item in [
            "descendant-or-self::*[@id = 'title']",
            "descendant-or-self::*[@id = 'head']",
            "descendant-or-self::*[@id = 'heading']",
            "descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' pageTitle ')]",
            "descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' news_title ')]",
            "descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' title ')]",
            "descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' head ')]",
            "descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' heading ')]",
            "descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' contentheading ')]",
            "descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' small_header_red ')]"
    ]:
        for e in doc.xpath(item):
            if e.text:
                add_match(candidates, e.text, orig)
            if e.text_content():
                add_match(candidates, e.text_content(), orig)

    if candidates:
        title = sorted(candidates, key=len)[-1]
    else:
        for delimiter in [' | ', ' - ', ' :: ', ' / ']:
            if delimiter in title:
                parts = orig.split(delimiter)
                if len(parts[0].split()) >= 4:
                    title = parts[0]
                    break
                elif len(parts[-1].split()) >= 4:
                    title = parts[-1]
                    break
        else:
            if ': ' in title:
                parts = orig.split(': ')
                if len(parts[-1].split()) >= 4:
                    title = parts[-1]
                else:
                    title = orig.split(': ', 1)[1]

    if not 15 < len(title) < 150:
        return orig

    return title


def get_body(doc):
    # Drop elements that never contribute to the article text, then serialise
    # the <body> (or the whole document if there is none) as unicode with
    # unwanted attributes stripped.
    for elem in doc.xpath('.//script | .//link | .//style'):
        elem.drop_tree()
    raw_html = tostring(doc.body or doc, encoding='unicode')
    return clean_attributes(raw_html)
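

if __name__ == '__main__':
    # Minimal usage sketch, for illustration only: the sample HTML below is
    # made up and is not part of the library. It shows how build_doc,
    # get_title, shorten_title and get_body fit together on a raw page.
    sample_page = (
        '<html><head><title>Example Site | A Fairly Long Article Title '
        'About Something</title></head>'
        '<body><h1>A Fairly Long Article Title About Something</h1>'
        '<p>Some body text.</p></body></html>'
    )
    doc = build_doc(sample_page)
    print(get_title(doc))      # full, normalized <title> text
    print(shorten_title(doc))  # heading-based or delimiter-trimmed title
    print(get_body(doc))       # serialised <body> with attributes cleaned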