#!/usr/local/bin/python3.8
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
# License: GPLv3 Copyright: 2010, Kovid Goyal <kovid at kovidgoyal.net>


import re

from calibre import prepare_string_for_xml
from calibre.constants import preferred_encoding
from calibre.ebooks.BeautifulSoup import (
    BeautifulSoup, CData, Comment, Declaration, NavigableString,
    ProcessingInstruction
)
from calibre.utils.html2text import html2text

# Hackish - ignoring sentences ending or beginning in numbers to avoid
# confusion with decimal points.
lost_cr_pat = re.compile(r'([a-z])([\.\?!])([A-Z])')
# Abbreviations whose internal period must not be mistaken for a sentence end.
lost_cr_exception_pat = re.compile(r'(Ph\.D)|(D\.Phil)|((Dr|Mr|Mrs|Ms)\.[A-Z])')
# Markup that triggers the html2text/Markdown round trip instead of the
# paragraph heuristics.
sanitize_pat = re.compile(r'<script|<table|<tr|<td|<th|<style|<iframe',
                          re.IGNORECASE)


def comments_to_html(comments):
    r'''
    Convert free-form comment text into a block of HTML paragraphs.

    The input is handled by the first rule that applies:

    * Empty or missing input returns '<p></p>'.

    * Input that is not a str is decoded with the preferred encoding,
      replacing undecodable bytes, before any other rule is checked.

    * Text whose first non-whitespace character is '<' is assumed to be
      HTML already and is returned untouched:
        '<p>pre-formatted text</p>' returns as
        <p>pre-formatted text</p>

    * Text without any '<' is XML-escaped, split into paragraphs on blank
      lines, and single newlines become <br />:
        'A line of text\n\nFollowed by a line of text' returns as
        <p class="description">A line of text</p>
        <p class="description">Followed by a line of text</p>

        'A line of text.\nA second line of text.' returns as
        <p class="description">A line of text.<br />A second line of text.</p>

    * Text containing any tag matched by sanitize_pat (script, table, tr,
      td, th, style, iframe) is passed to sanitize_comments_html(). If that
      raises, the traceback is printed and '<p></p>' is returned.

    * Any other text with inline markup goes through the paragraph
      heuristics: a lowercase letter, one of '.', '?' or '!', and an
      uppercase letter with nothing in between is treated as a lost
      paragraph break (except inside the abbreviations matched by
      lost_cr_exception_pat), blank lines start new paragraphs, single
      newlines become <br />, '--' becomes an em dash, and the result is
      parsed with BeautifulSoup. Top-level strings and inline tags are
      grouped into <p> elements, other tags are kept as siblings, and every
      <p> gets class="description":
        'plain text with <i>minimal</i> <b>markup</b>' returns as
        <p class="description">plain text with <i>minimal</i> <b>markup</b></p>

        '...end of a <i>paragraph</i>.Somehow the break was lost...' is
        split into two paragraphs after 'paragraph.'
    '''
    if not comments:
        return '<p></p>'
    if not isinstance(comments, str):
        comments = comments.decode(preferred_encoding, 'replace')

    if comments.lstrip().startswith('<'):
        # Comment is already HTML do not mess with it
        return comments

    if '<' not in comments:
        # Pure text: escape it and build the paragraphs directly.
        comments = prepare_string_for_xml(comments)
        parts = ['<p class="description">%s</p>' % x.replace('\n', '<br />')
                 for x in comments.split('\n\n')]
        return '\n'.join(parts)

    if sanitize_pat.search(comments) is not None:
        try:
            return sanitize_comments_html(comments)
        except Exception:
            # Best effort: report the failure and fall back to an empty
            # paragraph, but let KeyboardInterrupt/SystemExit propagate.
            import traceback
            traceback.print_exc()
            return '<p></p>'

    # Shield known abbreviations by slipping a \r after their period so the
    # lost-CR pattern below cannot match them; the \r is removed afterwards.
    comments = lost_cr_exception_pat.sub(
        lambda m: m.group().replace('.', '.\r'), comments)
    # Explode lost CRs to \n\n
    comments = lost_cr_pat.sub(r'\1\2\n\n\3', comments)

    comments = comments.replace('\r', '')
    # Convert \n\n to <p>s
    comments = comments.replace('\n\n', '<p>')
    # Convert solo returns to <br />
    comments = comments.replace('\n', '<br />')
    # Convert two hyphens to emdash
    comments = comments.replace('--', '—')

    soup = BeautifulSoup('<div>' + comments + '</div>').find('div')
    result = BeautifulSoup('<div>')
    container = result.find('div')

    inline_tags = ('br', 'b', 'i', 'em', 'strong', 'span', 'font', 'a', 'hr')
    paragraph = None  # the <p> currently being filled, if any

    # Iterate over a copy: appending a node elsewhere detaches it from soup.
    for token in list(soup.contents):
        # These are NavigableString subclasses, so they must be filtered out
        # before the generic string check below.
        if isinstance(token, (CData, Comment, Declaration, ProcessingInstruction)):
            continue
        if isinstance(token, NavigableString) or token.name in inline_tags:
            if paragraph is None:
                paragraph = result.new_tag('p')
            paragraph.append(token)
        else:
            # A block-level tag ends the running paragraph and is emitted
            # as a sibling of the paragraphs.
            if paragraph is not None:
                container.append(paragraph)
                paragraph = None
            container.append(token)

    if paragraph is not None:
        container.append(paragraph)

    for p in container.findAll('p'):
        p['class'] = 'description'

    return container.decode_contents()


def markdown(val):
    '''
    Convert the Markdown text val to HTML.

    The Markdown converter is imported and instantiated on first use and
    then cached as an attribute of this function, so later calls reuse the
    same instance.
    '''
    try:
        md = markdown.Markdown
    except AttributeError:
        from calibre.ebooks.markdown import Markdown
        md = markdown.Markdown = Markdown()
    return md.convert(val)


def merge_comments(one, two):
    '''
    Return the HTML forms of the comments one and two, as produced by
    comments_to_html(), joined by a blank line.
    '''
    return comments_to_html(one) + '\n\n' + comments_to_html(two)


def sanitize_comments_html(html):
    '''
    Rebuild html by converting it to text with html2text() and rendering
    that text with a fresh Markdown converter. Returns the resulting HTML.
    '''
    from calibre.ebooks.markdown import Markdown
    text = html2text(html)
    return Markdown().convert(text)


def find_tests():
    '''Return the unittest suite for this module.'''
    import unittest

    class Test(unittest.TestCase):

        def test_comments_to_html(self):
            for pat, val in [
                    (b'lineone\n\nlinetwo',
                        '<p class="description">lineone</p>\n<p class="description">linetwo</p>'),

                    # The bare ampersand is escaped when the tree is
                    # serialized back to markup.
                    ('a <b>b&c</b>\nf',
                        '<p class="description">a <b>b&amp;c</b><br/>f</p>'),

                    # NOTE(review): the strings on either side of the dropped
                    # processing instruction are appended separately, each
                    # with its own space, so the real output may contain two
                    # spaces between 'a' and 'b' - confirm against a test run.
                    ('a <?xml asd> b\n\ncd',
                        '<p class="description">a b</p><p class="description">cd</p>'),
            ]:
                cval = comments_to_html(pat)
                self.assertEqual(cval, val)

    return unittest.defaultTestLoader.loadTestsFromTestCase(Test)