1# -*- coding: utf-8 -*-
2
3
4__license__ = 'GPL 3'
5__copyright__ = '2009, John Schember <john@nachtimwald.com>'
6__docformat__ = 'restructuredtext en'
7
8import os
9
10from calibre import _ent_pat, walk, xml_entity_to_unicode
11from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
12
# Markdown extensions that can be enabled for the conversion, mapped from the
# extension name to its translated, human readable description.
MD_EXTENSIONS = dict(
    abbr=_('Abbreviations'),
    admonition=_('Support admonitions'),
    attr_list=_('Add attribute to HTML tags'),
    codehilite=_('Add code highlighting via Pygments'),
    def_list=_('Definition lists'),
    extra=_('Enables various common extensions'),
    fenced_code=_('Alternative code block syntax'),
    footnotes=_('Footnotes'),
    legacy_attrs=_('Use legacy element attributes'),
    legacy_em=_('Use legacy underscore handling for connected words'),
    meta=_('Metadata in the document'),
    nl2br=_('Treat newlines as hard breaks'),
    sane_lists=_('Do not allow mixing list types'),
    smarty=_('Use markdown\'s internal smartypants parser'),
    tables=_('Support tables'),
    toc=_('Generate a table of contents'),
    wikilinks=_('Wiki style links'),
)
32
33
class TXTInput(InputFormatPlugin):
    '''
    Input plugin for text based formats.

    The text (read directly from the input stream, or collected from the text
    files inside a TXTZ zip archive) is decoded, normalized and converted to
    HTML with the basic, Markdown or Textile processor. The resulting HTML
    file is then handed to the HTML input plugin, which builds the OEB book.
    '''

    name        = 'TXT Input'
    author      = 'John Schember'
    description = _('Convert TXT files to HTML')
    file_types  = {'txt', 'txtz', 'text', 'md', 'textile', 'markdown'}
    commit_name = 'txt_input'
    # Tables mapping each allowed value to its translated description. They
    # are also used below to build the choices and help text of the options.
    ui_data = {
        'md_extensions': MD_EXTENSIONS,
        'paragraph_types': {
            'auto': _('Try to auto detect paragraph type'),
            'block': _('Treat a blank line as a paragraph break'),
            'single': _('Assume every line is a paragraph'),
            'print': _('Assume every line starting with 2+ spaces or a tab starts a paragraph'),
            'unformatted': _('Most lines have hard line breaks, few/no blank lines or indents'),
            'off': _('Don\'t modify the paragraph structure'),
        },
        'formatting_types': {
            'auto': _('Automatically decide which formatting processor to use'),
            'plain': _('No formatting'),
            'heuristic': _('Use heuristics to determine chapter headings, italics, etc.'),
            'textile': _('Use the TexTile markup language'),
            'markdown': _('Use the Markdown markup language')
        },
    }

    options = {
        OptionRecommendation(name='formatting_type', recommended_value='auto',
            choices=list(ui_data['formatting_types']),
            help=_('Formatting used within the document.\n'
                   '* auto: {auto}\n'
                   '* plain: {plain}\n'
                   '* heuristic: {heuristic}\n'
                   '* textile: {textile}\n'
                   '* markdown: {markdown}\n'
                   'To learn more about markdown see {url}').format(
                       url='https://daringfireball.net/projects/markdown/', **ui_data['formatting_types'])
        ),
        OptionRecommendation(name='paragraph_type', recommended_value='auto',
            choices=list(ui_data['paragraph_types']),
            help=_('Paragraph structure to assume. The value of "off" is useful for formatted documents such as Markdown or Textile. '
                   'Choices are:\n'
                   '* auto: {auto}\n'
                   '* block: {block}\n'
                   '* single: {single}\n'
                   '* print:  {print}\n'
                   '* unformatted: {unformatted}\n'
                   '* off: {off}').format(**ui_data['paragraph_types'])
        ),
        OptionRecommendation(name='preserve_spaces', recommended_value=False,
            help=_('Normally extra spaces are condensed into a single space. '
                'With this option all spaces will be displayed.')),
        OptionRecommendation(name='txt_in_remove_indents', recommended_value=False,
            help=_('Normally extra space at the beginning of lines is retained. '
                   'With this option they will be removed.')),
        OptionRecommendation(name="markdown_extensions", recommended_value='footnotes, tables, toc',
            help=_('Enable extensions to Markdown syntax. Extensions are formatting that is not part '
                   'of the standard Markdown format. The extensions enabled by default: %default.\n'
                   'To learn more about Markdown extensions, see {}\n'
                   'This should be a comma separated list of extensions to enable:\n'
                   ).format('https://python-markdown.github.io/extensions/') + '\n'.join('* %s: %s' % (k, MD_EXTENSIONS[k]) for k in sorted(MD_EXTENSIONS))),
    }

    def shift_file(self, fname, data):
        '''
        Write ``data`` into ``self.output_dir`` without overwriting anything.

        If a file called ``fname`` already exists there, a counter is inserted
        before the extension (``name-1.ext``, ``name-2.ext``, ...) until an
        unused name is found.

        :param fname: Desired file name (no directory part).
        :param data: Bytes to write.
        :return: Path of the file that was written.
        '''
        name, ext = os.path.splitext(fname)
        candidate = os.path.join(self.output_dir, fname)
        c = 0
        while os.path.exists(candidate):
            c += 1
            candidate = os.path.join(self.output_dir, '{}-{}{}'.format(name, c, ext))
        with open(candidate, 'wb') as f:
            f.write(data)
        return candidate

    def fix_resources(self, html, base_dir):
        '''
        Copy locally referenced images next to the generated HTML.

        Every ``<img src>`` that is a relative path (no ``file``, ``http``,
        ``https`` or ``ftp`` prefix and not an absolute path) is resolved
        against ``base_dir``. If it points to a readable file, the file is
        copied into ``self.output_dir`` via :meth:`shift_file` and ``src`` is
        rewritten to the name of the copy.

        :param html: HTML markup to process.
        :param base_dir: Directory relative image paths are resolved against.
        :return: The re-serialized HTML if any image was copied, otherwise
                 ``html`` unchanged.
        '''
        from html5_parser import parse
        root = parse(html)
        changed = False
        for img in root.xpath('//img[@src]'):
            src = img.get('src')
            if not src:
                continue
            prefix = src.split(':', 1)[0].lower()
            if prefix in ('file', 'http', 'https', 'ftp') or os.path.isabs(src):
                continue
            # NOTE(review): a relative src containing '..' can resolve to a
            # file outside base_dir; confirm this is acceptable when the input
            # document is untrusted.
            src = os.path.join(base_dir, src)
            if os.path.isfile(src) and os.access(src, os.R_OK):
                with open(src, 'rb') as f:
                    data = f.read()
                copied = self.shift_file(os.path.basename(src), data)
                changed = True
                img.set('src', os.path.basename(copied))
        if changed:
            from lxml import etree
            html = etree.tostring(root, encoding='unicode')
        return html

    @staticmethod
    def _strip_bom(raw):
        '''
        Return ``raw`` (bytes) without a leading Unicode byte order mark.

        The UTF-32 marks are tested before the UTF-16 ones because
        ``BOM_UTF32_LE`` begins with the bytes of ``BOM_UTF16_LE``; testing
        UTF-16 first would strip only half of a UTF-32-LE mark.
        '''
        import codecs
        for bom in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE, codecs.BOM_UTF8,
                    codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
            if raw.startswith(bom):
                return raw[len(bom):]
        return raw

    def convert(self, stream, options, file_ext, log,
                accelerators):
        '''
        Convert the text in ``stream`` to an OEB book.

        :param stream: Input file object. Its content is read as bytes, or
                       opened as a zip archive when ``file_ext`` is ``txtz``.
        :param options: Conversion options. Several attributes are modified
                        here (for example ``formatting_type``,
                        ``paragraph_type`` and ``input_encoding``), and the
                        recommended values of the HTML input plugin's options
                        are set on it before the HTML conversion runs.
        :param file_ext: Extension of the input file, without the leading dot.
        :param log: Logger used for progress and debug messages.
        :param accelerators: Not used by this method.
        :return: The OEB book produced by the HTML input plugin.
        :raises ValueError: If the Markdown processor fails with a
                            ``RuntimeError``.
        '''
        from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
        from calibre.ebooks.chardet import detect
        from calibre.utils.zipfile import ZipFile
        from calibre.ebooks.txt.processor import (convert_basic,
                convert_markdown_with_metadata, separate_paragraphs_single_line,
                separate_paragraphs_print_formatted, preserve_spaces,
                detect_paragraph_type, detect_formatting_type,
                normalize_line_endings, convert_textile, remove_indents,
                block_to_single_line, separate_hard_scene_breaks)

        self.log = log
        txt = b''
        log.debug('Reading text from file...')
        length = 0
        base_dir = self.output_dir = os.getcwd()

        # Extract content from zip archive.
        if file_ext == 'txtz':
            options.input_encoding = 'utf-8'
            zf = ZipFile(stream)
            zf.extractall('.')

            # Concatenate every text file found in the archive, each followed
            # by a blank line.
            chunks = []
            for x in walk('.'):
                ext = os.path.splitext(x)[1].lower()
                if ext in ('.txt', '.text', '.textile', '.md', '.markdown'):
                    # NOTE: file_ext now holds the dotted extension of the
                    # last text file found.
                    file_ext = ext
                    with open(x, 'rb') as tf:
                        chunks.append(tf.read())
                    chunks.append(b'\n\n')
            txt = b''.join(chunks)
            if os.path.exists('metadata.opf'):
                from lxml import etree
                with open('metadata.opf', 'rb') as mf:
                    raw = mf.read()
                try:
                    root = etree.fromstring(raw)
                except Exception:
                    # The metadata is only a hint, so a broken OPF is ignored.
                    log.debug('Failed to parse metadata.opf from TXTZ archive, ignoring it')
                else:
                    node = root.find('text-formatting')
                    if node is not None and node.text:
                        txt_formatting = node.text.strip()
                        if txt_formatting in ('plain', 'textile', 'markdown') and options.formatting_type == 'auto':
                            log.info(f'Using metadata from TXTZ archive to set text formatting type to: {txt_formatting}')
                            options.formatting_type = txt_formatting
                            if txt_formatting != 'plain':
                                options.paragraph_type = 'off'
            if options.formatting_type == 'auto':
                # Fall back to the extension of the archived text file. It is
                # stored with its leading dot, so strip that before comparing.
                inner_ext = file_ext.lstrip('.')
                forced = {'textile': 'textile', 'md': 'markdown', 'markdown': 'markdown'}.get(inner_ext)
                if forced is not None:
                    options.formatting_type = forced
                    log.info('File extension indicates particular formatting. '
                            'Forcing formatting type to: %s'%options.formatting_type)
                    options.paragraph_type = 'off'
        else:
            if getattr(stream, 'name', None):
                base_dir = os.path.dirname(stream.name)
            txt = stream.read()
            if file_ext in {'md', 'textile', 'markdown'}:
                options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
                log.info('File extension indicates particular formatting. '
                        'Forcing formatting type to: %s'%options.formatting_type)
                options.paragraph_type = 'off'

        # Get the encoding of the document.
        if options.input_encoding:
            ienc = options.input_encoding
            log.debug('Using user specified input encoding of %s' % ienc)
        else:
            det_encoding = detect(txt[:4096])
            det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence']
            if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
                    'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
                    'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
                # Microsoft Word exports to HTML with encoding incorrectly set to
                # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
                det_encoding = 'gbk'
            ienc = det_encoding
            log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, confidence * 100))
        if not ienc:
            ienc = 'utf-8'
            log.debug('No input encoding specified and could not auto detect using %s' % ienc)
        # Remove BOM from start of txt as its presence can confuse markdown
        txt = self._strip_bom(txt)
        txt = txt.decode(ienc, 'replace')

        # Replace entities
        txt = _ent_pat.sub(xml_entity_to_unicode, txt)

        # Normalize line endings
        txt = normalize_line_endings(txt)

        # Determine the paragraph type of the document.
        if options.paragraph_type == 'auto':
            options.paragraph_type = detect_paragraph_type(txt)
            if options.paragraph_type == 'unknown':
                log.debug('Could not reliably determine paragraph type using block')
                options.paragraph_type = 'block'
            else:
                log.debug('Auto detected paragraph type as %s' % options.paragraph_type)

        # Detect formatting
        if options.formatting_type == 'auto':
            options.formatting_type = detect_formatting_type(txt)
            log.debug('Auto detected formatting as %s' % options.formatting_type)

        if options.formatting_type == 'heuristic':
            options.enable_heuristics = True
            options.unwrap_lines = False
            options.smarten_punctuation = True

        # Reformat paragraphs to block formatting based on the detected type.
        # We don't check for block because the processor assumes block.
        # single and print are transformed to block for processing.
        if options.paragraph_type == 'single':
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'print':
            txt = separate_hard_scene_breaks(txt)
            txt = separate_paragraphs_print_formatted(txt)
            txt = block_to_single_line(txt)
        elif options.paragraph_type == 'unformatted':
            from calibre.ebooks.conversion.utils import HeuristicProcessor
            # unwrap lines based on punctuation
            docanalysis = DocAnalysis('txt', txt)
            length = docanalysis.line_length(.5)
            preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
            txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'block':
            txt = separate_hard_scene_breaks(txt)
            txt = block_to_single_line(txt)

        if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
            docanalysis = DocAnalysis('txt', txt)
            # Reuse the line length computed for 'unformatted' text, if any.
            if not length:
                length = docanalysis.line_length(.5)
            dehyphenator = Dehyphenator(options.verbose, log=self.log)
            txt = dehyphenator(txt, 'txt', length)

        # User requested transformation on the text.
        if options.txt_in_remove_indents:
            txt = remove_indents(txt)

        # Preserve spaces will replace multiple spaces to a space
        # followed by the &nbsp; entity.
        if options.preserve_spaces:
            txt = preserve_spaces(txt)

        # Process the text using the appropriate text processor.
        self.shifted_files = []
        try:
            html = ''
            input_mi = None
            if options.formatting_type == 'markdown':
                log.debug('Running text through markdown conversion...')
                extensions = [x.strip() for x in options.markdown_extensions.split(',') if x.strip()]
                try:
                    input_mi, html = convert_markdown_with_metadata(txt, extensions=extensions)
                except RuntimeError as err:
                    raise ValueError('This txt file has malformed markup, it cannot be'
                        ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax') from err
                html = self.fix_resources(html, base_dir)
            elif options.formatting_type == 'textile':
                log.debug('Running text through textile conversion...')
                html = convert_textile(txt)
                html = self.fix_resources(html, base_dir)
            else:
                log.debug('Running text through basic conversion...')
                flow_size = getattr(options, 'flow_size', 0)
                html = convert_basic(txt, epub_split_size_kb=flow_size)

            # Run the HTMLized text through the html processing plugin.
            from calibre.customize.ui import plugin_for_input_format
            html_input = plugin_for_input_format('html')
            for opt in html_input.options:
                setattr(options, opt.option.name, opt.recommended_value)
            options.input_encoding = 'utf-8'
            htmlfile = self.shift_file('index.html', html.encode('utf-8'))
            # debug_pipeline is cleared only for the duration of the nested
            # HTML conversion and is always restored afterwards.
            odi = options.debug_pipeline
            options.debug_pipeline = None
            try:
                # Generate oeb from html conversion.
                with open(htmlfile, 'rb') as html_stream:
                    oeb = html_input.convert(html_stream, options, 'html', log, {})
            finally:
                options.debug_pipeline = odi
        finally:
            for x in self.shifted_files:
                os.remove(x)

        # Set metadata from file.
        if input_mi is None:
            from calibre.customize.ui import get_file_type_metadata
            input_mi = get_file_type_metadata(stream, file_ext)
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
        # Remembered for postprocess_book().
        self.html_postprocess_title = input_mi.title

        return oeb

    def postprocess_book(self, oeb, opts, log):
        '''
        Replace placeholder titles in the spine documents.

        Every ``title`` element whose text equals the translated string
        ``Unknown`` is set to the title stored by :meth:`convert`.

        :param oeb: The OEB book whose spine items are updated in place.
        :param opts: Not used by this method.
        :param log: Not used by this method.
        '''
        for item in oeb.spine:
            # Only parsed documents (objects offering xpath) are inspected.
            if hasattr(item.data, 'xpath'):
                for title in item.data.xpath('//*[local-name()="title"]'):
                    if title.text == _('Unknown'):
                        title.text = self.html_postprocess_title
335