# -*- coding: utf-8 -*-


__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'

import os

from calibre import _ent_pat, walk, xml_entity_to_unicode
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation

# Markdown extension name -> translated description. Exposed through
# TXTInput.ui_data and listed in the help of the markdown_extensions option.
MD_EXTENSIONS = {
    'abbr': _('Abbreviations'),
    'admonition': _('Support admonitions'),
    'attr_list': _('Add attribute to HTML tags'),
    'codehilite': _('Add code highlighting via Pygments'),
    'def_list': _('Definition lists'),
    'extra': _('Enables various common extensions'),
    'fenced_code': _('Alternative code block syntax'),
    'footnotes': _('Footnotes'),
    'legacy_attrs': _('Use legacy element attributes'),
    'legacy_em': _('Use legacy underscore handling for connected words'),
    'meta': _('Metadata in the document'),
    'nl2br': _('Treat newlines as hard breaks'),
    'sane_lists': _('Do not allow mixing list types'),
    'smarty': _('Use markdown\'s internal smartypants parser'),
    'tables': _('Support tables'),
    'toc': _('Generate a table of contents'),
    'wikilinks': _('Wiki style links'),
}


class TXTInput(InputFormatPlugin):
    '''
    Input plugin that turns plain text, Markdown, Textile and TXTZ archives
    into HTML and then hands that HTML to the HTML input plugin.
    '''

    name = 'TXT Input'
    author = 'John Schember'
    description = _('Convert TXT files to HTML')
    file_types = {'txt', 'txtz', 'text', 'md', 'textile', 'markdown'}
    commit_name = 'txt_input'
    # Translated descriptions of the valid option values. The keys of
    # 'paragraph_types' and 'formatting_types' double as the ``choices`` of
    # the corresponding options below.
    ui_data = {
        'md_extensions': MD_EXTENSIONS,
        'paragraph_types': {
            'auto': _('Try to auto detect paragraph type'),
            'block': _('Treat a blank line as a paragraph break'),
            'single': _('Assume every line is a paragraph'),
            'print': _('Assume every line starting with 2+ spaces or a tab starts a paragraph'),
            'unformatted': _('Most lines have hard line breaks, few/no blank lines or indents'),
            'off': _('Don\'t modify the paragraph structure'),
        },
        'formatting_types': {
            'auto': _('Automatically decide which formatting processor to use'),
            'plain': _('No formatting'),
            'heuristic': _('Use heuristics to determine chapter headings, italics, etc.'),
            'textile': _('Use the TexTile markup language'),
            'markdown': _('Use the Markdown markup language')
        },
    }

    options = {
        OptionRecommendation(name='formatting_type', recommended_value='auto',
            choices=list(ui_data['formatting_types']),
            help=_('Formatting used within the document.\n'
                   '* auto: {auto}\n'
                   '* plain: {plain}\n'
                   '* heuristic: {heuristic}\n'
                   '* textile: {textile}\n'
                   '* markdown: {markdown}\n'
                   'To learn more about markdown see {url}').format(
                       url='https://daringfireball.net/projects/markdown/', **ui_data['formatting_types'])
        ),
        OptionRecommendation(name='paragraph_type', recommended_value='auto',
            choices=list(ui_data['paragraph_types']),
            help=_('Paragraph structure to assume. The value of "off" is useful for formatted documents such as Markdown or Textile. '
                   'Choices are:\n'
                   '* auto: {auto}\n'
                   '* block: {block}\n'
                   '* single: {single}\n'
                   '* print: {print}\n'
                   '* unformatted: {unformatted}\n'
                   '* off: {off}').format(**ui_data['paragraph_types'])
        ),
        OptionRecommendation(name='preserve_spaces', recommended_value=False,
            help=_('Normally extra spaces are condensed into a single space. '
                   'With this option all spaces will be displayed.')),
        OptionRecommendation(name='txt_in_remove_indents', recommended_value=False,
            help=_('Normally extra space at the beginning of lines is retained. '
                   'With this option they will be removed.')),
        OptionRecommendation(name="markdown_extensions", recommended_value='footnotes, tables, toc',
            help=_('Enable extensions to Markdown syntax. Extensions are formatting that is not part '
                   'of the standard Markdown format. The extensions enabled by default: %default.\n'
                   'To learn more about Markdown extensions, see {}\n'
                   'This should be a comma separated list of extensions to enable:\n'
                   ).format('https://python-markdown.github.io/extensions/') + '\n'.join('* %s: %s' % (k, MD_EXTENSIONS[k]) for k in sorted(MD_EXTENSIONS))),
    }

    def shift_file(self, fname, data):
        '''
        Write ``data`` to a not yet existing file inside ``self.output_dir``.

        If ``fname`` is already taken, ``-1``, ``-2``, ... is inserted before
        the extension until a free name is found.

        :param fname: Preferred file name (no directory part).
        :param data: Bytes to write.
        :return: Path of the file that was written.
        '''
        name, ext = os.path.splitext(fname)
        candidate = os.path.join(self.output_dir, fname)
        c = 0
        while os.path.exists(candidate):
            c += 1
            candidate = os.path.join(self.output_dir, '{}-{}{}'.format(name, c, ext))
        with open(candidate, 'wb') as f:
            f.write(data)
        return f.name

    def fix_resources(self, html, base_dir):
        '''
        Copy images referenced with relative paths into the output directory
        and point the ``<img>`` tags at the copies.

        :param html: HTML markup to scan.
        :param base_dir: Directory relative ``src`` values are resolved against.
        :return: The re-serialized markup if at least one ``src`` was
                 rewritten, otherwise ``html`` unchanged.
        '''
        from html5_parser import parse
        root = parse(html)
        changed = False
        for img in root.xpath('//img[@src]'):
            src = img.get('src')
            prefix = src.split(':', 1)[0].lower()
            # Leave URLs and absolute paths alone, only relative references
            # are resolved against base_dir.
            if src and prefix not in ('file', 'http', 'https', 'ftp') and not os.path.isabs(src):
                src = os.path.join(base_dir, src)
                if os.path.isfile(src) and os.access(src, os.R_OK):
                    with open(src, 'rb') as f:
                        data = f.read()
                    copied = self.shift_file(os.path.basename(src), data)
                    changed = True
                    img.set('src', os.path.basename(copied))
        if changed:
            from lxml import etree
            html = etree.tostring(root, encoding='unicode')
        return html

    def convert(self, stream, options, file_ext, log,
                accelerators):
        '''
        Convert the text in ``stream`` to an OEB book.

        The text is read (from the members of the archive for TXTZ), decoded,
        normalized to block paragraphs according to ``options.paragraph_type``,
        rendered to HTML by the processor selected by
        ``options.formatting_type`` and finally passed to the HTML input
        plugin.

        :param stream: Open binary stream of the input file.
        :param options: Conversion options. Several attributes are modified
                        in place (formatting/paragraph type, input encoding,
                        heuristics flags and the HTML input plugin defaults).
        :param file_ext: Extension of the input file, without the dot.
        :param log: Logger used for progress messages.
        :param accelerators: Unused here.
        :return: The OEB book produced by the HTML input plugin.
        :raises ValueError: If the Markdown processor fails with a
                            ``RuntimeError``.
        '''
        from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
        from calibre.ebooks.chardet import detect
        from calibre.utils.zipfile import ZipFile
        from calibre.ebooks.txt.processor import (convert_basic,
                convert_markdown_with_metadata, separate_paragraphs_single_line,
                separate_paragraphs_print_formatted, preserve_spaces,
                detect_paragraph_type, detect_formatting_type,
                normalize_line_endings, convert_textile, remove_indents,
                block_to_single_line, separate_hard_scene_breaks)

        self.log = log
        txt = b''
        log.debug('Reading text from file...')
        length = 0
        base_dir = self.output_dir = os.getcwd()

        # Extract content from zip archive.
        if file_ext == 'txtz':
            options.input_encoding = 'utf-8'
            zf = ZipFile(stream)
            zf.extractall('.')

            for x in walk('.'):
                ext = os.path.splitext(x)[1].lower()
                if ext in ('.txt', '.text', '.textile', '.md', '.markdown'):
                    file_ext = ext
                    with open(x, 'rb') as tf:
                        txt += tf.read() + b'\n\n'
            if os.path.exists('metadata.opf'):
                from lxml import etree
                with open('metadata.opf', 'rb') as mf:
                    raw = mf.read()
                try:
                    root = etree.fromstring(raw)
                except Exception:
                    # Unparseable metadata only means no formatting hint.
                    pass
                else:
                    txt_formatting = root.find('text-formatting')
                    if txt_formatting is not None and txt_formatting.text:
                        txt_formatting = txt_formatting.text.strip()
                        if txt_formatting in ('plain', 'textile', 'markdown') and options.formatting_type == 'auto':
                            log.info(f'Using metadata from TXTZ archive to set text formatting type to: {txt_formatting}')
                            options.formatting_type = txt_formatting
                            if txt_formatting != 'plain':
                                options.paragraph_type = 'off'
            if options.formatting_type == 'auto':
                # file_ext was taken from os.path.splitext() above and so
                # carries a leading dot; compare without it. The formatting
                # type is derived from the extension itself, the metadata
                # value may not exist at this point.
                inner_ext = file_ext.lstrip('.')
                if inner_ext == 'textile':
                    options.formatting_type = 'textile'
                    options.paragraph_type = 'off'
                elif inner_ext in ('md', 'markdown'):
                    options.formatting_type = 'markdown'
                    options.paragraph_type = 'off'
        else:
            if getattr(stream, 'name', None):
                base_dir = os.path.dirname(stream.name)
            txt = stream.read()
            if file_ext in {'md', 'textile', 'markdown'}:
                options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
                log.info('File extension indicates particular formatting. '
                        'Forcing formatting type to: %s'%options.formatting_type)
                options.paragraph_type = 'off'

        # Get the encoding of the document.
        if options.input_encoding:
            ienc = options.input_encoding
            log.debug('Using user specified input encoding of %s' % ienc)
        else:
            det_encoding = detect(txt[:4096])
            det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence']
            if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
                    'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
                    'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
                # Microsoft Word exports to HTML with encoding incorrectly set to
                # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
                det_encoding = 'gbk'
            ienc = det_encoding
            log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, confidence * 100))
        if not ienc:
            ienc = 'utf-8'
            log.debug('No input encoding specified and could not auto detect using %s' % ienc)
        # Remove BOM from start of txt as its presence can confuse markdown
        import codecs
        # The UTF-32 BOMs must be tested first: BOM_UTF32_LE starts with the
        # bytes of BOM_UTF16_LE, so testing UTF-16 first would strip only half
        # of a UTF-32 BOM.
        for bom in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE, codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8):
            if txt.startswith(bom):
                txt = txt[len(bom):]
                break
        txt = txt.decode(ienc, 'replace')

        # Replace entities
        txt = _ent_pat.sub(xml_entity_to_unicode, txt)

        # Normalize line endings
        txt = normalize_line_endings(txt)

        # Determine the paragraph type of the document.
        if options.paragraph_type == 'auto':
            options.paragraph_type = detect_paragraph_type(txt)
            if options.paragraph_type == 'unknown':
                log.debug('Could not reliably determine paragraph type using block')
                options.paragraph_type = 'block'
            else:
                log.debug('Auto detected paragraph type as %s' % options.paragraph_type)

        # Detect formatting
        if options.formatting_type == 'auto':
            options.formatting_type = detect_formatting_type(txt)
            log.debug('Auto detected formatting as %s' % options.formatting_type)

        if options.formatting_type == 'heuristic':
            setattr(options, 'enable_heuristics', True)
            setattr(options, 'unwrap_lines', False)
            setattr(options, 'smarten_punctuation', True)

        # Reformat paragraphs to block formatting based on the detected type.
        # We don't check for block because the processor assumes block.
        # single and print are transformed to block for processing.
        if options.paragraph_type == 'single':
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'print':
            txt = separate_hard_scene_breaks(txt)
            txt = separate_paragraphs_print_formatted(txt)
            txt = block_to_single_line(txt)
        elif options.paragraph_type == 'unformatted':
            from calibre.ebooks.conversion.utils import HeuristicProcessor
            # unwrap lines based on punctuation
            docanalysis = DocAnalysis('txt', txt)
            length = docanalysis.line_length(.5)
            preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
            txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'block':
            txt = separate_hard_scene_breaks(txt)
            txt = block_to_single_line(txt)

        if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
            docanalysis = DocAnalysis('txt', txt)
            # Reuse the line length computed for the 'unformatted' case, if any.
            if not length:
                length = docanalysis.line_length(.5)
            dehyphenator = Dehyphenator(options.verbose, log=self.log)
            txt = dehyphenator(txt,'txt', length)

        # User requested transformation on the text.
        if options.txt_in_remove_indents:
            txt = remove_indents(txt)

        # Preserve spaces will replace multiple spaces to a space
        # followed by the entity.
        if options.preserve_spaces:
            txt = preserve_spaces(txt)

        # Process the text using the appropriate text processor.
        # NOTE(review): nothing visible in this file appends to shifted_files,
        # so the cleanup loop in the finally clause below removes nothing.
        self.shifted_files = []
        try:
            html = ''
            input_mi = None
            if options.formatting_type == 'markdown':
                log.debug('Running text through markdown conversion...')
                try:
                    input_mi, html = convert_markdown_with_metadata(txt, extensions=[x.strip() for x in options.markdown_extensions.split(',') if x.strip()])
                except RuntimeError:
                    raise ValueError('This txt file has malformed markup, it cannot be'
                        ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax')
                html = self.fix_resources(html, base_dir)
            elif options.formatting_type == 'textile':
                log.debug('Running text through textile conversion...')
                html = convert_textile(txt)
                html = self.fix_resources(html, base_dir)
            else:
                log.debug('Running text through basic conversion...')
                flow_size = getattr(options, 'flow_size', 0)
                html = convert_basic(txt, epub_split_size_kb=flow_size)

            # Run the HTMLized text through the html processing plugin.
            from calibre.customize.ui import plugin_for_input_format
            html_input = plugin_for_input_format('html')
            for opt in html_input.options:
                setattr(options, opt.option.name, opt.recommended_value)
            options.input_encoding = 'utf-8'
            htmlfile = self.shift_file('index.html', html.encode('utf-8'))
            # debug_pipeline is cleared only for the nested HTML conversion
            # and restored afterwards, even if that conversion fails.
            odi = options.debug_pipeline
            options.debug_pipeline = None
            try:
                # Generate oeb from html conversion.
                with open(htmlfile, 'rb') as html_stream:
                    oeb = html_input.convert(html_stream, options, 'html', log, {})
            finally:
                options.debug_pipeline = odi
        finally:
            for x in self.shifted_files:
                os.remove(x)

        # Set metadata from file.
        if input_mi is None:
            from calibre.customize.ui import get_file_type_metadata
            input_mi = get_file_type_metadata(stream, file_ext)
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
        # Remembered for postprocess_book().
        self.html_postprocess_title = input_mi.title

        return oeb

    def postprocess_book(self, oeb, opts, log):
        '''
        Replace placeholder titles in the spine documents.

        Every ``title`` element whose text equals the translated string
        ``Unknown`` is set to the title stored by :meth:`convert`.
        '''
        for item in oeb.spine:
            # Only items whose data supports xpath queries are inspected.
            if hasattr(item.data, 'xpath'):
                for title in item.data.xpath('//*[local-name()="title"]'):
                    if title.text == _('Unknown'):
                        title.text = self.html_postprocess_title