1#!/usr/local/bin/python3.8
2# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
3# License: GPLv3 Copyright: 2010, Kovid Goyal <kovid at kovidgoyal.net>
4
5
6import re
7
8from calibre import prepare_string_for_xml
9from calibre.constants import preferred_encoding
10from calibre.ebooks.BeautifulSoup import (
11    BeautifulSoup, CData, Comment, Declaration, NavigableString,
12    ProcessingInstruction
13)
14from calibre.utils.html2text import html2text
15
# Matches a sentence boundary whose line break was lost: a lowercase letter,
# one of . ? ! and an uppercase letter with nothing in between. The three
# groups let comments_to_html() re-insert a paragraph break after the
# punctuation.
# Hackish - ignoring sentences ending or beginning in numbers to avoid
# confusion with decimal points.
lost_cr_pat = re.compile('([a-z])([\\.\\?!])([A-Z])')
# Abbreviations and titles (Ph.D, D.Phil, Dr./Mr./Mrs./Ms. followed by a
# capital) whose dots must not be treated as sentence ends.
# comments_to_html() marks these before applying lost_cr_pat.
lost_cr_exception_pat = re.compile(r'(Ph\.D)|(D\.Phil)|((Dr|Mr|Mrs|Ms)\.[A-Z])')
# Opening tags (matched case-insensitively) that make comments_to_html()
# hand the text to sanitize_comments_html() instead of processing it itself.
sanitize_pat = re.compile(r'<script|<table|<tr|<td|<th|<style|<iframe',
        re.IGNORECASE)
22
23
def comments_to_html(comments):
    '''
    Convert arbitrary comment text into a normalized block of HTML.

    ``comments`` may be ``str`` or ``bytes``; bytes are decoded with
    ``preferred_encoding`` (undecodable bytes are replaced). The first rule
    that applies decides the result:

    * Empty or ``None`` input returns ``<p></p>``.

    * Text whose first non-whitespace character is ``<`` is taken to be HTML
      already and is returned untouched.

    * Text containing no ``<`` at all is XML-escaped. Chunks separated by a
      blank line each become ``<p class="description">...</p>`` (joined by
      newlines) and the remaining single newlines become ``<br />``.

    * Text containing one of the tags in ``sanitize_pat`` is passed through
      sanitize_comments_html(). If that raises, the traceback is printed
      and ``<p></p>`` is returned.

    * Anything else (text with inline markup) has lost sentence breaks
      restored, so that ``...end of a paragraph.Somehow the break was
      lost...`` is split after the full stop. Carriage returns are dropped,
      blank lines become paragraph breaks, single newlines become
      ``<br />`` and ``--`` becomes ``&mdash;``. The result is parsed and
      every run of top level text and inline tags is wrapped in a
      ``<p class="description">``. Other top level tags are kept as they
      are, and comments, CDATA, declarations and processing instructions
      are dropped.

    Returns the resulting HTML as a string.
    '''
    if not comments:
        return '<p></p>'
    if not isinstance(comments, str):
        comments = comments.decode(preferred_encoding, 'replace')

    if comments.lstrip().startswith('<'):
        # Comment is already HTML do not mess with it
        return comments

    if '<' not in comments:
        comments = prepare_string_for_xml(comments)
        parts = ['<p class="description">%s</p>' % x.replace('\n', '<br />')
                 for x in comments.split('\n\n')]
        return '\n'.join(parts)

    if sanitize_pat.search(comments) is not None:
        try:
            return sanitize_comments_html(comments)
        except Exception:
            # Best effort: report the failure and fall back to an empty
            # paragraph. KeyboardInterrupt/SystemExit are deliberately not
            # caught here.
            import traceback
            traceback.print_exc()
            return '<p></p>'

    # Shield the dots of known abbreviations with a temporary \r so that they
    # are not mistaken for sentence ends by lost_cr_pat below.
    comments = lost_cr_exception_pat.sub(
        lambda m: m.group().replace('.', '.\r'), comments)
    # Explode lost CRs to \n\n. Matches of lost_cr_pat can never overlap, so
    # a single substitution pass handles every occurrence.
    comments = lost_cr_pat.sub(r'\1\2\n\n\3', comments)

    # Drop the temporary markers (and any carriage returns from the input)
    comments = comments.replace('\r', '')
    # Convert \n\n to <p>s
    comments = comments.replace('\n\n', '<p>')
    # Convert solo returns to <br />
    comments = comments.replace('\n', '<br />')
    # Convert two hyphens to emdash
    comments = comments.replace('--', '&mdash;')

    soup = BeautifulSoup('<div>' + comments + '</div>').find('div')
    result = BeautifulSoup('<div>')
    container = result.find('div')
    inline_tags = ('br', 'b', 'i', 'em', 'strong', 'span', 'font', 'a', 'hr')
    # Paragraph currently being filled, None when no paragraph is open
    pTag = None

    # Iterate over a snapshot of the children, since moving a node into the
    # result tree changes soup.contents.
    for token in list(soup.contents):
        # These are NavigableString subclasses, so they have to be filtered
        # out before the plain text check below.
        if isinstance(token, (CData, Comment, Declaration, ProcessingInstruction)):
            continue
        if isinstance(token, NavigableString) or token.name in inline_tags:
            # Text and inline markup accumulate in the open paragraph
            if pTag is None:
                pTag = result.new_tag('p')
            pTag.append(token)
        else:
            # Any other tag closes the open paragraph and is kept as is
            if pTag is not None:
                container.append(pTag)
                pTag = None
            container.append(token)

    if pTag is not None:
        container.append(pTag)

    for p in container.findAll('p'):
        p['class'] = 'description'

    return container.decode_contents()
130
131
def markdown(val):
    '''
    Convert the markdown text ``val`` to HTML and return it.

    The converter is created on first use and stored on this function as
    the ``Markdown`` attribute, so later calls reuse the same instance.
    '''
    if not hasattr(markdown, 'Markdown'):
        # Imported lazily, only when the converter is first needed
        from calibre.ebooks.markdown import Markdown
        markdown.Markdown = Markdown()
    converter = markdown.Markdown
    return converter.convert(val)
139
140
def merge_comments(one, two):
    '''
    Convert both comment texts with comments_to_html() and return the two
    HTML fragments separated by a blank line, ``one`` first.
    '''
    converted = [comments_to_html(text) for text in (one, two)]
    return '\n\n'.join(converted)
143
144
def sanitize_comments_html(html):
    '''
    Rebuild ``html`` by converting it to text with html2text() and rendering
    that text back to HTML with Markdown. Returns the regenerated HTML.

    comments_to_html() calls this for input that matches ``sanitize_pat``.
    '''
    from calibre.ebooks.markdown import Markdown
    as_text = html2text(html)
    return Markdown().convert(as_text)
151
152
def find_tests():
    '''
    Build and return the unittest suite for this module.

    The suite holds a single test that feeds sample inputs to
    comments_to_html() and compares the output with the expected HTML.
    '''
    import unittest

    # (input, expected output) pairs for comments_to_html()
    cases = (
        (b'lineone\n\nlinetwo',
            '<p class="description">lineone</p>\n<p class="description">linetwo</p>'),

        ('a <b>b&c</b>\nf',
            '<p class="description">a <b>b&amp;c</b><br/>f</p>'),

        ('a <?xml asd> b\n\ncd',
            '<p class="description">a  b</p><p class="description">cd</p>'),
    )

    class Test(unittest.TestCase):

        def test_comments_to_html(self):
            for raw, expected in cases:
                self.assertEqual(comments_to_html(raw), expected)

    loader = unittest.defaultTestLoader
    return loader.loadTestsFromTestCase(Test)
173