1# -*- coding: utf-8 -*-
2
3
4__license__ = 'GPL 3'
5__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
6__docformat__ = 'restructuredtext en'
7
8import os, re, sys, shutil, pprint, json
9from functools import partial
10
11from calibre.customize.conversion import OptionRecommendation, DummyReporter
12from calibre.customize.ui import input_profiles, output_profiles, \
13        plugin_for_input_format, plugin_for_output_format, \
14        available_input_formats, available_output_formats, \
15        run_plugins_on_preprocess, run_plugins_on_postprocess
16from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
17from calibre.ptempfile import PersistentTemporaryDirectory
18from calibre.utils.date import parse_date
19from calibre.utils.zipfile import ZipFile
20from calibre import (extract, walk, isbytestring, filesystem_encoding,
21        get_types_map)
22from calibre.constants import __version__
23from polyglot.builtins import string_or_bytes
24
# Explanatory text, stored as bytes, describing the four sub-folders of a
# conversion debug folder (input, parsed, structure, processed) and which
# pipeline stage each one is meant to help debug.
# NOTE(review): presumably written out as a README when the debug_pipeline
# option is used -- the write site is outside this view; confirm.
DEBUG_README=b'''
This debug folder contains snapshots of the e-book as it passes through the
various stages of conversion. The stages are:

    1. input - This is the result of running the input plugin on the source
    file. Use this folder to debug the input plugin.

    2. parsed - This is the result of preprocessing and parsing the output of
    the input plugin. Note that for some input plugins this will be identical to
    the input sub-folder. Use this folder to debug structure detection,
    etc.

    3. structure - This corresponds to the stage in the pipeline when structure
    detection has run, but before the CSS is flattened. Use this folder to
    debug the CSS flattening, font size conversion, etc.

    4. processed - This corresponds to the e-book as it is passed to the output
    plugin. Use this folder to debug the output plugin.

'''
45
46
def supported_input_formats():
    '''
    Return the formats reported by available_input_formats() with the archive
    formats from ARCHIVE_FMTS added.

    The collection returned by available_input_formats() is extended in place
    and that same object is returned.
    '''
    fmts = available_input_formats()
    # Reuse the module-level ARCHIVE_FMTS constant instead of repeating the
    # ('zip', 'rar', 'oebzip') literal, so the two lists cannot drift apart.
    fmts.update(ARCHIVE_FMTS)
    return fmts
52
53
class OptionValues:
    '''
    Empty class that defines no members of its own; instances act as plain
    containers for whatever attributes are assigned to them.
    '''
    # NOTE(review): presumably instances carry the resolved conversion option
    # values -- the code that populates them is outside this view; confirm.
56
57
class CompositeProgressReporter:
    '''
    Callable that maps a local progress fraction onto the interval
    [global_min, global_max] and forwards the result to another reporter.
    '''

    def __init__(self, global_min, global_max, global_reporter):
        '''
        :param global_min: Value reported when the local fraction is 0.
        :param global_max: Value reported when the local fraction is 1.
        :param global_reporter: Callable invoked as ``(fraction, msg)`` with
                                the rescaled fraction.
        '''
        self.global_min = global_min
        self.global_max = global_max
        self.global_reporter = global_reporter

    def __call__(self, fraction, msg=''):
        '''
        Linearly rescale ``fraction`` into the global interval and pass it,
        together with ``msg``, to the wrapped reporter.
        '''
        span = self.global_max - self.global_min
        self.global_reporter(self.global_min + fraction * span, msg)
68
69
# Names of the archive/container formats; these are the same three names that
# supported_input_formats() reports in addition to the plugin-provided ones.
ARCHIVE_FMTS = ('zip', 'rar', 'oebzip')
71
72
73class Plumber:
74
75    '''
    The `Plumber` manages the conversion pipeline. A UI should call the methods
77    :method:`merge_ui_recommendations` and then :method:`run`. The plumber will
78    take care of the rest.
79    '''
80
81    metadata_option_names = [
82        'title', 'authors', 'title_sort', 'author_sort', 'cover', 'comments',
83        'publisher', 'series', 'series_index', 'rating', 'isbn',
84        'tags', 'book_producer', 'language', 'pubdate', 'timestamp'
85        ]
86
87    def __init__(self, input, output, log, report_progress=DummyReporter(),
88            dummy=False, merge_plugin_recs=True, abort_after_input_dump=False,
89            override_input_metadata=False, for_regex_wizard=False, view_kepub=False):
90        '''
91        :param input: Path to input file.
92        :param output: Path to output file/folder
93        '''
94        if isbytestring(input):
95            input = input.decode(filesystem_encoding)
96        if isbytestring(output):
97            output = output.decode(filesystem_encoding)
98        self.original_input_arg = input
99        self.for_regex_wizard = for_regex_wizard
100        self.input = os.path.abspath(input)
101        self.output = os.path.abspath(output)
102        self.log = log
103        self.ui_reporter = report_progress
104        self.abort_after_input_dump = abort_after_input_dump
105        self.override_input_metadata = override_input_metadata
106
107        # Pipeline options {{{
108        # Initialize the conversion options that are independent of input and
109        # output formats. The input and output plugins can still disable these
110        # options via recommendations.
111        self.pipeline_options = [
112
113OptionRecommendation(name='verbose',
114            recommended_value=0, level=OptionRecommendation.LOW,
115            short_switch='v',
116            help=_('Level of verbosity. Specify multiple times for greater '
117                   'verbosity. Specifying it twice will result in full '
118                   'verbosity, once medium verbosity and zero times least verbosity.')
119        ),
120
121OptionRecommendation(name='debug_pipeline',
122            recommended_value=None, level=OptionRecommendation.LOW,
123            short_switch='d',
124            help=_('Save the output from different stages of the conversion '
125                   'pipeline to the specified '
126                   'folder. Useful if you are unsure at which stage '
127                   'of the conversion process a bug is occurring.')
128        ),
129
130OptionRecommendation(name='input_profile',
131            recommended_value='default', level=OptionRecommendation.LOW,
132            choices=[x.short_name for x in input_profiles()],
133            help=_('Specify the input profile. The input profile gives the '
134                   'conversion system information on how to interpret '
135                   'various information in the input document. For '
136                   'example resolution dependent lengths (i.e. lengths in '
137                   'pixels). Choices are:') + ' ' + ', '.join([
138                       x.short_name for x in input_profiles()])
139        ),
140
141OptionRecommendation(name='output_profile',
142            recommended_value='default', level=OptionRecommendation.LOW,
143            choices=[x.short_name for x in output_profiles()],
144            help=_('Specify the output profile. The output profile '
145                   'tells the conversion system how to optimize the '
146                   'created document for the specified device (such as by resizing images for the device screen size). In some cases, '
147                   'an output profile can be used to optimize the output for a particular device, but this is rarely necessary. '
148                   'Choices are:') + ', '.join([
149                       x.short_name for x in output_profiles()])
150        ),
151
152OptionRecommendation(name='base_font_size',
153            recommended_value=0, level=OptionRecommendation.LOW,
154            help=_('The base font size in pts. All font sizes in the produced book '
155                   'will be rescaled based on this size. By choosing a larger '
156                   'size you can make the fonts in the output bigger and vice '
157                   'versa. By default, when the value is zero, the base font size is chosen based on '
158                   'the output profile you chose.'
159                   )
160        ),
161
162OptionRecommendation(name='font_size_mapping',
163            recommended_value=None, level=OptionRecommendation.LOW,
164            help=_('Mapping from CSS font names to font sizes in pts. '
165                   'An example setting is 12,12,14,16,18,20,22,24. '
166                   'These are the mappings for the sizes xx-small to xx-large, '
167                   'with the final size being for huge fonts. The font '
168                   'rescaling algorithm uses these sizes to intelligently '
169                   'rescale fonts. The default is to use a mapping based on '
170                   'the output profile you chose.'
171                   )
172        ),
173
174OptionRecommendation(name='disable_font_rescaling',
175            recommended_value=False, level=OptionRecommendation.LOW,
176            help=_('Disable all rescaling of font sizes.'
177                   )
178        ),
179
180OptionRecommendation(name='minimum_line_height',
181            recommended_value=120.0, level=OptionRecommendation.LOW,
182            help=_(
183            'The minimum line height, as a percentage of the element\'s '
184            'calculated font size. calibre will ensure that every element '
185            'has a line height of at least this setting, irrespective of '
186            'what the input document specifies. Set to zero to disable. '
187            'Default is 120%. Use this setting in preference to '
188            'the direct line height specification, unless you know what '
189            'you are doing. For example, you can achieve "double spaced" '
190            'text by setting this to 240.'
191            )
192        ),
193
194
195OptionRecommendation(name='line_height',
196            recommended_value=0, level=OptionRecommendation.LOW,
197            help=_(
198            'The line height in pts. Controls spacing between consecutive '
199            'lines of text. Only applies to elements that do not define '
200            'their own line height. In most cases, the minimum line height '
201            'option is more useful. '
202            'By default no line height manipulation is performed.'
203            )
204        ),
205
206OptionRecommendation(name='embed_font_family',
207        recommended_value=None, level=OptionRecommendation.LOW,
208        help=_(
209            'Embed the specified font family into the book. This specifies '
210            'the "base" font used for the book. If the input document '
211            'specifies its own fonts, they may override this base font. '
212            'You can use the filter style information option to remove fonts from the '
213            'input document. Note that font embedding only works '
214            'with some output formats, principally EPUB, AZW3 and DOCX.')
215        ),
216
217OptionRecommendation(name='embed_all_fonts',
218        recommended_value=False, level=OptionRecommendation.LOW,
219        help=_(
220            'Embed every font that is referenced in the input document '
221            'but not already embedded. This will search your system for the '
222            'fonts, and if found, they will be embedded. Embedding will only work '
223            'if the format you are converting to supports embedded fonts, such as '
224            'EPUB, AZW3, DOCX or PDF. Please ensure that you have the proper license for embedding '
225            'the fonts used in this document.'
226        )),
227
228OptionRecommendation(name='subset_embedded_fonts',
229        recommended_value=False, level=OptionRecommendation.LOW,
230        help=_(
231            'Subset all embedded fonts. Every embedded font is reduced '
232            'to contain only the glyphs used in this document. This decreases '
233            'the size of the font files. Useful if you are embedding a '
234            'particularly large font with lots of unused glyphs.')
235        ),
236
237OptionRecommendation(name='linearize_tables',
238            recommended_value=False, level=OptionRecommendation.LOW,
239            help=_('Some badly designed documents use tables to control the '
240                'layout of text on the page. When converted these documents '
241                'often have text that runs off the page and other artifacts. '
242                'This option will extract the content from the tables and '
243                'present it in a linear fashion.'
244                )
245        ),
246
247OptionRecommendation(name='level1_toc',
248            recommended_value=None, level=OptionRecommendation.LOW,
249            help=_('XPath expression that specifies all tags that '
250            'should be added to the Table of Contents at level one. If '
251            'this is specified, it takes precedence over other forms '
252            'of auto-detection.'
253            ' See the XPath Tutorial in the calibre User Manual for examples.'
254                )
255        ),
256
257OptionRecommendation(name='level2_toc',
258            recommended_value=None, level=OptionRecommendation.LOW,
259            help=_('XPath expression that specifies all tags that should be '
260            'added to the Table of Contents at level two. Each entry is added '
261            'under the previous level one entry.'
262            ' See the XPath Tutorial in the calibre User Manual for examples.'
263                )
264        ),
265
266OptionRecommendation(name='level3_toc',
267            recommended_value=None, level=OptionRecommendation.LOW,
268            help=_('XPath expression that specifies all tags that should be '
269            'added to the Table of Contents at level three. Each entry '
270            'is added under the previous level two entry.'
271            ' See the XPath Tutorial in the calibre User Manual for examples.'
272                )
273        ),
274
275OptionRecommendation(name='use_auto_toc',
276            recommended_value=False, level=OptionRecommendation.LOW,
277            help=_('Normally, if the source file already has a Table of '
278            'Contents, it is used in preference to the auto-generated one. '
279            'With this option, the auto-generated one is always used.'
280                )
281        ),
282
283OptionRecommendation(name='no_chapters_in_toc',
284            recommended_value=False, level=OptionRecommendation.LOW,
285            help=_("Don't add auto-detected chapters to the Table of "
286            'Contents.'
287                )
288        ),
289
290OptionRecommendation(name='toc_threshold',
291            recommended_value=6, level=OptionRecommendation.LOW,
292            help=_(
293        'If fewer than this number of chapters is detected, then links '
294        'are added to the Table of Contents. Default: %default')
295        ),
296
297OptionRecommendation(name='max_toc_links',
298            recommended_value=50, level=OptionRecommendation.LOW,
299            help=_('Maximum number of links to insert into the TOC. Set to 0 '
300                'to disable. Default is: %default. Links are only added to the '
301                'TOC if less than the threshold number of chapters were detected.'
302                )
303        ),
304
305OptionRecommendation(name='toc_filter',
306            recommended_value=None, level=OptionRecommendation.LOW,
307            help=_('Remove entries from the Table of Contents whose titles '
308            'match the specified regular expression. Matching entries and all '
309            'their children are removed.'
310                )
311        ),
312
313OptionRecommendation(name='duplicate_links_in_toc',
314            recommended_value=False, level=OptionRecommendation.LOW,
315            help=_('When creating a TOC from links in the input document, '
316                'allow duplicate entries, i.e. allow more than one entry '
317                'with the same text, provided that they point to a '
318                'different location.')
319        ),
320
321
322OptionRecommendation(name='chapter',
323        recommended_value="//*[((name()='h1' or name()='h2') and "
324              r"re:test(., '\s*((chapter|book|section|part)\s+)|((prolog|prologue|epilogue)(\s+|$))', 'i')) or @class "
325              "= 'chapter']", level=OptionRecommendation.LOW,
326            help=_('An XPath expression to detect chapter titles. The default '
327                'is to consider <h1> or <h2> tags that contain the words '
328                '"chapter", "book", "section", "prologue", "epilogue" or "part" as chapter titles as '
329                'well as any tags that have class="chapter". The expression '
330                'used must evaluate to a list of elements. To disable chapter '
331                'detection, use the expression "/". See the XPath Tutorial '
332                'in the calibre User Manual for further help on using this '
333                'feature.'
334                )
335        ),
336
337OptionRecommendation(name='chapter_mark',
338            recommended_value='pagebreak', level=OptionRecommendation.LOW,
339            choices=['pagebreak', 'rule', 'both', 'none'],
340            help=_('Specify how to mark detected chapters. A value of '
341                    '"pagebreak" will insert page breaks before chapters. '
342                    'A value of "rule" will insert a line before chapters. '
343                    'A value of "none" will disable chapter marking and a '
344                    'value of "both" will use both page breaks and lines '
345                    'to mark chapters.')
346        ),
347
348OptionRecommendation(name='start_reading_at',
349        recommended_value=None, level=OptionRecommendation.LOW,
350        help=_('An XPath expression to detect the location in the document'
351            ' at which to start reading. Some e-book reading programs'
352            ' (most prominently the Kindle) use this location as the'
353            ' position at which to open the book. See the XPath tutorial'
354            ' in the calibre User Manual for further help using this'
355            ' feature.')
356        ),
357
358OptionRecommendation(name='extra_css',
359            recommended_value=None, level=OptionRecommendation.LOW,
360            help=_('Either the path to a CSS stylesheet or raw CSS. '
361                'This CSS will be appended to the style rules from '
362                'the source file, so it can be used to override those '
363                'rules.')
364        ),
365
366OptionRecommendation(name='transform_css_rules',
367            recommended_value=None, level=OptionRecommendation.LOW,
368            help=_('Rules for transforming the styles in this book. These'
369                   ' rules are applied after all other CSS processing is done.')
370        ),
371
372OptionRecommendation(name='transform_html_rules',
373            recommended_value=None, level=OptionRecommendation.LOW,
374            help=_('Rules for transforming the HTML in this book. These'
375                   ' rules are applied after the HTML is parsed, but before any other transformations.')
376        ),
377
378OptionRecommendation(name='filter_css',
379            recommended_value=None, level=OptionRecommendation.LOW,
380            help=_('A comma separated list of CSS properties that '
381                'will be removed from all CSS style rules. This is useful '
382                'if the presence of some style information prevents it '
383                'from being overridden on your device. '
384                'For example: '
385                'font-family,color,margin-left,margin-right')
386        ),
387
388OptionRecommendation(name='expand_css',
389            recommended_value=False, level=OptionRecommendation.LOW,
390            help=_(
391                'By default, calibre will use the shorthand form for various'
392                ' CSS properties such as margin, padding, border, etc. This'
393                ' option will cause it to use the full expanded form instead.'
394                ' Note that CSS is always expanded when generating EPUB files'
395                ' with the output profile set to one of the Nook profiles'
396                ' as the Nook cannot handle shorthand CSS.')
397        ),
398
399OptionRecommendation(name='page_breaks_before',
400            recommended_value="//*[name()='h1' or name()='h2']",
401            level=OptionRecommendation.LOW,
402            help=_('An XPath expression. Page breaks are inserted '
403                'before the specified elements. To disable use the expression: /')
404        ),
405
406OptionRecommendation(name='remove_fake_margins',
407            recommended_value=True, level=OptionRecommendation.LOW,
408            help=_('Some documents specify page margins by '
409                'specifying a left and right margin on each individual '
410                'paragraph. calibre will try to detect and remove these '
411                'margins. Sometimes, this can cause the removal of '
412                'margins that should not have been removed. In this '
413                'case you can disable the removal.')
414        ),
415
416
417OptionRecommendation(name='margin_top',
418        recommended_value=5.0, level=OptionRecommendation.LOW,
419        help=_('Set the top margin in pts. Default is %default. '
420            'Setting this to less than zero will cause no margin to be set '
421            '(the margin setting in the original document will be preserved). '
422            'Note: Page oriented formats such as PDF and DOCX have their own'
423            ' margin settings that take precedence.')),
424
425OptionRecommendation(name='margin_bottom',
426        recommended_value=5.0, level=OptionRecommendation.LOW,
427        help=_('Set the bottom margin in pts. Default is %default. '
428            'Setting this to less than zero will cause no margin to be set '
429            '(the margin setting in the original document will be preserved). '
430            'Note: Page oriented formats such as PDF and DOCX have their own'
431            ' margin settings that take precedence.')),
432
433OptionRecommendation(name='margin_left',
434        recommended_value=5.0, level=OptionRecommendation.LOW,
435        help=_('Set the left margin in pts. Default is %default. '
436            'Setting this to less than zero will cause no margin to be set '
437            '(the margin setting in the original document will be preserved). '
438            'Note: Page oriented formats such as PDF and DOCX have their own'
439            ' margin settings that take precedence.')),
440
441OptionRecommendation(name='margin_right',
442        recommended_value=5.0, level=OptionRecommendation.LOW,
443        help=_('Set the right margin in pts. Default is %default. '
444            'Setting this to less than zero will cause no margin to be set '
445            '(the margin setting in the original document will be preserved). '
446            'Note: Page oriented formats such as PDF and DOCX have their own'
447            ' margin settings that take precedence.')),
448
449OptionRecommendation(name='change_justification',
450        recommended_value='original', level=OptionRecommendation.LOW,
451        choices=['left','justify','original'],
452        help=_('Change text justification. A value of "left" converts all'
453            ' justified text in the source to left aligned (i.e. '
454            'unjustified) text. A value of "justify" converts all '
455            'unjustified text to justified. A value of "original" '
456            '(the default) does not change justification in the '
457            'source file. Note that only some output formats support '
458            'justification.')),
459
460OptionRecommendation(name='remove_paragraph_spacing',
461        recommended_value=False, level=OptionRecommendation.LOW,
462        help=_('Remove spacing between paragraphs. Also sets an indent on '
463        'paragraphs of 1.5em. Spacing removal will not work '
464        'if the source file does not use paragraphs (<p> or <div> tags).')
465        ),
466
467OptionRecommendation(name='remove_paragraph_spacing_indent_size',
468        recommended_value=1.5, level=OptionRecommendation.LOW,
469        help=_('When calibre removes blank lines between paragraphs, it automatically '
470            'sets a paragraph indent, to ensure that paragraphs can be easily '
471            'distinguished. This option controls the width of that indent (in em). '
472            'If you set this value negative, then the indent specified in the input '
473            'document is used, that is, calibre does not change the indentation.')
474        ),
475
476OptionRecommendation(name='prefer_metadata_cover',
477        recommended_value=False, level=OptionRecommendation.LOW,
478        help=_('Use the cover detected from the source file in preference '
479        'to the specified cover.')
480        ),
481
482OptionRecommendation(name='insert_blank_line',
483        recommended_value=False, level=OptionRecommendation.LOW,
484        help=_('Insert a blank line between paragraphs. Will not work '
485            'if the source file does not use paragraphs (<p> or <div> tags).'
486            )
487        ),
488
489OptionRecommendation(name='insert_blank_line_size',
490        recommended_value=0.5, level=OptionRecommendation.LOW,
491        help=_('Set the height of the inserted blank lines (in em).'
492            ' The height of the lines between paragraphs will be twice the value'
493            ' set here.')
494        ),
495
496OptionRecommendation(name='remove_first_image',
497        recommended_value=False, level=OptionRecommendation.LOW,
498        help=_('Remove the first image from the input e-book. Useful if the '
499        'input document has a cover image that is not identified as a cover. '
500        'In this case, if you set a cover in calibre, the output document will '
501        'end up with two cover images if you do not specify this option.'
502            )
503        ),
504
505OptionRecommendation(name='insert_metadata',
506        recommended_value=False, level=OptionRecommendation.LOW,
507        help=_('Insert the book metadata at the start of '
508            'the book. This is useful if your e-book reader does not support '
509            'displaying/searching metadata directly.'
510            )
511        ),
512
513OptionRecommendation(name='smarten_punctuation',
514        recommended_value=False, level=OptionRecommendation.LOW,
515        help=_('Convert plain quotes, dashes and ellipsis to their '
516            'typographically correct equivalents. For details, see '
517            'https://daringfireball.net/projects/smartypants.'
518            )
519        ),
520
521OptionRecommendation(name='unsmarten_punctuation',
522        recommended_value=False, level=OptionRecommendation.LOW,
523        help=_('Convert fancy quotes, dashes and ellipsis to their '
524               'plain equivalents.'
525            )
526        ),
527
528OptionRecommendation(name='read_metadata_from_opf',
529            recommended_value=None, level=OptionRecommendation.LOW,
530            short_switch='m',
531            help=_('Read metadata from the specified OPF file. Metadata read '
532                   'from this file will override any metadata in the source '
533                   'file.')
534        ),
535
536OptionRecommendation(name='asciiize',
537        recommended_value=False, level=OptionRecommendation.LOW,
538        help=(_('Transliterate Unicode characters to an ASCII '
539            'representation. Use with care because this will replace '
540            'Unicode characters with ASCII. For instance it will replace "{0}" '
541            'with "{1}". Also, note that in '
542            'cases where there are multiple representations of a character '
543            '(characters shared by Chinese and Japanese for instance) the '
544            'representation based on the current calibre interface language will be '
545            'used.').format('Pelé', 'Pele'))
546        ),
547
548OptionRecommendation(name='keep_ligatures',
549            recommended_value=False, level=OptionRecommendation.LOW,
550            help=_('Preserve ligatures present in the input document. '
551                'A ligature is a special rendering of a pair of '
552                'characters like ff, fi, fl et cetera. '
553                'Most readers do not have support for '
554                'ligatures in their default fonts, so they are '
555                'unlikely to render correctly. By default, calibre '
556                'will turn a ligature into the corresponding pair of normal '
557                'characters. This option will preserve them instead.')
558        ),
559
560OptionRecommendation(name='title',
561    recommended_value=None, level=OptionRecommendation.LOW,
562    help=_('Set the title.')),
563
564OptionRecommendation(name='authors',
565    recommended_value=None, level=OptionRecommendation.LOW,
566    help=_('Set the authors. Multiple authors should be separated by '
567    'ampersands.')),
568
569OptionRecommendation(name='title_sort',
570    recommended_value=None, level=OptionRecommendation.LOW,
571    help=_('The version of the title to be used for sorting. ')),
572
573OptionRecommendation(name='author_sort',
574    recommended_value=None, level=OptionRecommendation.LOW,
575    help=_('String to be used when sorting by author. ')),
576
577OptionRecommendation(name='cover',
578    recommended_value=None, level=OptionRecommendation.LOW,
579    help=_('Set the cover to the specified file or URL')),
580
581OptionRecommendation(name='comments',
582    recommended_value=None, level=OptionRecommendation.LOW,
583    help=_('Set the e-book description.')),
584
585OptionRecommendation(name='publisher',
586    recommended_value=None, level=OptionRecommendation.LOW,
587    help=_('Set the e-book publisher.')),
588
589OptionRecommendation(name='series',
590    recommended_value=None, level=OptionRecommendation.LOW,
591    help=_('Set the series this e-book belongs to.')),
592
593OptionRecommendation(name='series_index',
594    recommended_value=None, level=OptionRecommendation.LOW,
595    help=_('Set the index of the book in this series.')),
596
597OptionRecommendation(name='rating',
598    recommended_value=None, level=OptionRecommendation.LOW,
599    help=_('Set the rating. Should be a number between 1 and 5.')),
600
601OptionRecommendation(name='isbn',
602    recommended_value=None, level=OptionRecommendation.LOW,
603    help=_('Set the ISBN of the book.')),
604
605OptionRecommendation(name='tags',
606    recommended_value=None, level=OptionRecommendation.LOW,
607    help=_('Set the tags for the book. Should be a comma separated list.')),
608
609OptionRecommendation(name='book_producer',
610    recommended_value=None, level=OptionRecommendation.LOW,
611    help=_('Set the book producer.')),
612
613OptionRecommendation(name='language',
614    recommended_value=None, level=OptionRecommendation.LOW,
615    help=_('Set the language.')),
616
617OptionRecommendation(name='pubdate',
618    recommended_value=None, level=OptionRecommendation.LOW,
619    help=_('Set the publication date (assumed to be in the local timezone, unless the timezone is explicitly specified)')),
620
621OptionRecommendation(name='timestamp',
622    recommended_value=None, level=OptionRecommendation.LOW,
623    help=_('Set the book timestamp (no longer used anywhere)')),
624
625OptionRecommendation(name='enable_heuristics',
626    recommended_value=False, level=OptionRecommendation.LOW,
627    help=_('Enable heuristic processing. This option must be set for any '
628           'heuristic processing to take place.')),
629
630OptionRecommendation(name='markup_chapter_headings',
631    recommended_value=True, level=OptionRecommendation.LOW,
632    help=_('Detect unformatted chapter headings and sub headings. Change '
633           'them to h2 and h3 tags.  This setting will not create a TOC, '
634           'but can be used in conjunction with structure detection to create '
635           'one.')),
636
637OptionRecommendation(name='italicize_common_cases',
638    recommended_value=True, level=OptionRecommendation.LOW,
639    help=_('Look for common words and patterns that denote '
640           'italics and italicize them.')),
641
642OptionRecommendation(name='fix_indents',
643    recommended_value=True, level=OptionRecommendation.LOW,
644    help=_('Turn indentation created from multiple non-breaking space entities '
645           'into CSS indents.')),
646
647OptionRecommendation(name='html_unwrap_factor',
648    recommended_value=0.40, level=OptionRecommendation.LOW,
649    help=_('Scale used to determine the length at which a line should '
650            'be unwrapped. Valid values are a decimal between 0 and 1. The '
651            'default is 0.4, just below the median line length.  If only a '
652            'few lines in the document require unwrapping this value should '
653            'be reduced')),
654
655OptionRecommendation(name='unwrap_lines',
656    recommended_value=True, level=OptionRecommendation.LOW,
657    help=_('Unwrap lines using punctuation and other formatting clues.')),
658
659OptionRecommendation(name='delete_blank_paragraphs',
660    recommended_value=True, level=OptionRecommendation.LOW,
661    help=_('Remove empty paragraphs from the document when they exist between '
662           'every other paragraph')),
663
664OptionRecommendation(name='format_scene_breaks',
665    recommended_value=True, level=OptionRecommendation.LOW,
666    help=_('Left aligned scene break markers are center aligned. '
667           'Replace soft scene breaks that use multiple blank lines with '
668           'horizontal rules.')),
669
670OptionRecommendation(name='replace_scene_breaks',
671    recommended_value='', level=OptionRecommendation.LOW,
672    help=_('Replace scene breaks with the specified text. By default, the '
673        'text from the input document is used.')),
674
675OptionRecommendation(name='dehyphenate',
676    recommended_value=True, level=OptionRecommendation.LOW,
677    help=_('Analyze hyphenated words throughout the document.  The '
678           'document itself is used as a dictionary to determine whether hyphens '
679           'should be retained or removed.')),
680
681OptionRecommendation(name='renumber_headings',
682    recommended_value=True, level=OptionRecommendation.LOW,
683    help=_('Looks for occurrences of sequential <h1> or <h2> tags. '
684           'The tags are renumbered to prevent splitting in the middle '
685           'of chapter headings.')),
686
687OptionRecommendation(name='sr1_search',
688    recommended_value='', level=OptionRecommendation.LOW,
689    help=_('Search pattern (regular expression) to be replaced with '
690           'sr1-replace.')),
691
692OptionRecommendation(name='sr1_replace',
693    recommended_value='', level=OptionRecommendation.LOW,
694    help=_('Replacement to replace the text found with sr1-search.')),
695
696OptionRecommendation(name='sr2_search',
697    recommended_value='', level=OptionRecommendation.LOW,
698    help=_('Search pattern (regular expression) to be replaced with '
699           'sr2-replace.')),
700
701OptionRecommendation(name='sr2_replace',
702    recommended_value='', level=OptionRecommendation.LOW,
703    help=_('Replacement to replace the text found with sr2-search.')),
704
705OptionRecommendation(name='sr3_search',
706    recommended_value='', level=OptionRecommendation.LOW,
707    help=_('Search pattern (regular expression) to be replaced with '
708           'sr3-replace.')),
709
710OptionRecommendation(name='sr3_replace',
711    recommended_value='', level=OptionRecommendation.LOW,
712    help=_('Replacement to replace the text found with sr3-search.')),
713
714OptionRecommendation(name='search_replace',
715    recommended_value=None, level=OptionRecommendation.LOW, help=_(
716        'Path to a file containing search and replace regular expressions. '
717        'The file must contain alternating lines of regular expression '
718        'followed by replacement pattern (which can be an empty line). '
719        'The regular expression must be in the Python regex syntax and '
720        'the file must be UTF-8 encoded.')),
721]
722        # }}}
723
724        input_fmt = os.path.splitext(self.input)[1]
725        if not input_fmt:
726            raise ValueError('Input file must have an extension')
727        input_fmt = input_fmt[1:].lower().replace('original_', '')
728        if view_kepub and input_fmt.lower() == 'kepub':
729            input_fmt = 'epub'
730        self.archive_input_tdir = None
731        self.changed_options = set()
732        if input_fmt in ARCHIVE_FMTS:
733            self.log('Processing archive...')
734            tdir = PersistentTemporaryDirectory('_pl_arc')
735            self.input, input_fmt = self.unarchive(self.input, tdir)
736            self.archive_input_tdir = tdir
737        if os.access(self.input, os.R_OK):
738            nfp = run_plugins_on_preprocess(self.input, input_fmt)
739            if nfp != self.input:
740                self.input = nfp
741                input_fmt = os.path.splitext(self.input)[1]
742                if not input_fmt:
743                    raise ValueError('Input file must have an extension')
744                input_fmt = input_fmt[1:].lower()
745
746        if os.path.exists(self.output) and os.path.isdir(self.output):
747            output_fmt = 'oeb'
748        else:
749            output_fmt = os.path.splitext(self.output)[1]
750            if not output_fmt:
751                output_fmt = '.oeb'
752            output_fmt = output_fmt[1:].lower()
753
754        self.input_plugin  = plugin_for_input_format(input_fmt)
755        self.output_plugin = plugin_for_output_format(output_fmt)
756
757        if self.input_plugin is None:
758            raise ValueError('No plugin to handle input format: '+input_fmt)
759
760        if self.output_plugin is None:
761            raise ValueError('No plugin to handle output format: '+output_fmt)
762
763        self.input_fmt = input_fmt
764        self.output_fmt = output_fmt
765
766        self.all_format_options = set()
767        self.input_options = set()
768        self.output_options = set()
769        # Build set of all possible options. Two options are equal if their
770        # names are the same.
771        if not dummy:
772            self.input_options  = self.input_plugin.options.union(
773                                        self.input_plugin.common_options)
774            self.output_options = self.output_plugin.options.union(
775                                    self.output_plugin.common_options)
776        else:
777            for fmt in available_input_formats():
778                input_plugin = plugin_for_input_format(fmt)
779                if input_plugin:
780                    self.all_format_options = self.all_format_options.union(
781                        input_plugin.options.union(input_plugin.common_options))
782            for fmt in available_output_formats():
783                output_plugin = plugin_for_output_format(fmt)
784                if output_plugin:
785                    self.all_format_options = self.all_format_options.union(
786                        output_plugin.options.union(output_plugin.common_options))
787
788        # Remove the options that have been disabled by recommendations from the
789        # plugins.
790        for w in ('input_options', 'output_options',
791                'all_format_options'):
792            temp = set()
793            for x in getattr(self, w):
794                temp.add(x.clone())
795            setattr(self, w, temp)
796        if merge_plugin_recs:
797            self.merge_plugin_recommendations()
798
799    @classmethod
800    def unarchive(self, path, tdir):
801        extract(path, tdir)
802        files = list(walk(tdir))
803        files = [f if isinstance(f, str) else f.decode(filesystem_encoding)
804                for f in files]
805        from calibre.customize.ui import available_input_formats
806        fmts = set(available_input_formats())
807        fmts -= {'htm', 'html', 'xhtm', 'xhtml'}
808        fmts -= set(ARCHIVE_FMTS)
809
810        for ext in fmts:
811            for f in files:
812                if f.lower().endswith('.'+ext):
813                    if ext in ['txt', 'rtf'] and os.stat(f).st_size < 2048:
814                        continue
815                    return f, ext
816        return self.find_html_index(files)
817
818    @classmethod
819    def find_html_index(self, files):
820        '''
821        Given a list of files, find the most likely root HTML file in the
822        list.
823        '''
824        html_pat = re.compile(r'\.(x){0,1}htm(l){0,1}$', re.IGNORECASE)
825        html_files = [f for f in files if html_pat.search(f) is not None]
826        if not html_files:
827            raise ValueError(_('Could not find an e-book inside the archive'))
828        html_files = [(f, os.stat(f).st_size) for f in html_files]
829        html_files.sort(key=lambda x: x[1])
830        html_files = [f[0] for f in html_files]
831        for q in ('toc', 'index'):
832            for f in html_files:
833                if os.path.splitext(os.path.basename(f))[0].lower() == q:
834                    return f, os.path.splitext(f)[1].lower()[1:]
835        return html_files[-1], os.path.splitext(html_files[-1])[1].lower()[1:]
836
837    def get_all_options(self):
838        ans = {}
839        for group in (self.input_options, self.pipeline_options,
840                      self.output_options, self.all_format_options):
841            for rec in group:
842                ans[rec.option] = rec.recommended_value
843        return ans
844
845    def get_option_by_name(self, name):
846        for group in (self.input_options, self.pipeline_options,
847                      self.output_options, self.all_format_options):
848            for rec in group:
849                if rec.option == name:
850                    return rec
851
852    def get_option_help(self, name):
853        rec = self.get_option_by_name(name)
854        help = getattr(rec, 'help', None)
855        if help is not None:
856            return help.replace('%default', str(rec.recommended_value))
857
858    def get_all_help(self):
859        ans = {}
860        for group in (self.input_options, self.pipeline_options,
861                      self.output_options, self.all_format_options):
862            for rec in group:
863                help = getattr(rec, 'help', None)
864                if help is not None:
865                    ans[rec.option.name] = help
866        return ans
867
868    def merge_plugin_recs(self, plugin):
869        for name, val, level in plugin.recommendations:
870            rec = self.get_option_by_name(name)
871            if rec is not None and rec.level <= level:
872                rec.recommended_value = val
873                rec.level = level
874
875    def merge_plugin_recommendations(self):
876        for source in (self.input_plugin, self.output_plugin):
877            self.merge_plugin_recs(source)
878
879    def merge_ui_recommendations(self, recommendations):
880        '''
881        Merge recommendations from the UI. As long as the UI recommendation
882        level is >= the baseline recommended level, the UI value is used,
883        *except* if the baseline has a recommendation level of `HIGH`.
884        '''
885
886        def eq(name, a, b):
887            if name in {'sr1_search', 'sr1_replace', 'sr2_search', 'sr2_replace', 'sr3_search', 'sr3_replace', 'filter_css', 'comments'}:
888                if not a and not b:
889                    return True
890            if name in {'transform_css_rules', 'transform_html_rules', 'search_replace'}:
891                if b == '[]':
892                    b = None
893            return a == b
894
895        for name, val, level in recommendations:
896            rec = self.get_option_by_name(name)
897            if rec is not None and rec.level <= level and rec.level < rec.HIGH:
898                changed = not eq(name, rec.recommended_value, val)
899                rec.recommended_value = val
900                rec.level = level
901                if changed:
902                    self.changed_options.add(rec)
903
904    def opts_to_mi(self, mi):
905        from calibre.ebooks.metadata import string_to_authors
906        for x in self.metadata_option_names:
907            val = getattr(self.opts, x, None)
908            if val is not None:
909                if x == 'authors':
910                    val = string_to_authors(val)
911                elif x == 'tags':
912                    val = [i.strip() for i in val.split(',')]
913                elif x in ('rating', 'series_index'):
914                    try:
915                        val = float(val)
916                    except ValueError:
917                        self.log.warn(_('Values of series index and rating must'
918                        ' be numbers. Ignoring'), val)
919                        continue
920                elif x in ('timestamp', 'pubdate'):
921                    try:
922                        val = parse_date(val, assume_utc=x=='timestamp')
923                    except:
924                        self.log.exception(_('Failed to parse date/time') + ' ' + str(val))
925                        continue
926                setattr(mi, x, val)
927
928    def download_cover(self, url):
929        from calibre import browser
930        from PIL import Image
931        import io
932        from calibre.ptempfile import PersistentTemporaryFile
933        self.log('Downloading cover from %r'%url)
934        br = browser()
935        raw = br.open_novisit(url).read()
936        buf = io.BytesIO(raw)
937        pt = PersistentTemporaryFile('.jpg')
938        pt.close()
939        img = Image.open(buf)
940        img.convert('RGB').save(pt.name)
941        return pt.name
942
943    def read_user_metadata(self):
944        '''
945        Read all metadata specified by the user. Command line options override
946        metadata from a specified OPF file.
947        '''
948        from calibre.ebooks.metadata import MetaInformation
949        from calibre.ebooks.metadata.opf2 import OPF
950        mi = MetaInformation(None, [])
951        if self.opts.read_metadata_from_opf is not None:
952            self.opts.read_metadata_from_opf = os.path.abspath(
953                                            self.opts.read_metadata_from_opf)
954            with lopen(self.opts.read_metadata_from_opf, 'rb') as stream:
955                opf = OPF(stream, os.path.dirname(self.opts.read_metadata_from_opf))
956            mi = opf.to_book_metadata()
957        self.opts_to_mi(mi)
958        if mi.cover:
959            if mi.cover.startswith('http:') or mi.cover.startswith('https:'):
960                mi.cover = self.download_cover(mi.cover)
961            ext = mi.cover.rpartition('.')[-1].lower().strip()
962            if ext not in ('png', 'jpg', 'jpeg', 'gif'):
963                ext = 'jpg'
964            with lopen(mi.cover, 'rb') as stream:
965                mi.cover_data = (ext, stream.read())
966            mi.cover = None
967        self.user_metadata = mi
968
969    def setup_options(self):
970        '''
971        Setup the `self.opts` object.
972        '''
973        self.opts = OptionValues()
974        for group in (self.input_options, self.pipeline_options,
975                  self.output_options, self.all_format_options):
976            for rec in group:
977                setattr(self.opts, rec.option.name, rec.recommended_value)
978
979        def set_profile(profiles, which):
980            attr = which + '_profile'
981            sval = getattr(self.opts, attr)
982            for x in profiles():
983                if x.short_name == sval:
984                    setattr(self.opts, attr, x)
985                    return
986            self.log.warn(
987                'Profile (%s) %r is no longer available, using default'%(which, sval))
988            for x in profiles():
989                if x.short_name == 'default':
990                    setattr(self.opts, attr, x)
991                    break
992
993        set_profile(input_profiles, 'input')
994        set_profile(output_profiles, 'output')
995
996        self.read_user_metadata()
997        self.opts.no_inline_navbars = self.opts.output_profile.supports_mobi_indexing \
998                and self.output_fmt == 'mobi'
999        if self.opts.verbose:
1000            self.log.filter_level = self.log.DEBUG
1001        if self.changed_options:
1002            self.log('Conversion options changed from defaults:')
1003            for rec in self.changed_options:
1004                if rec.option.name not in ('username', 'password'):
1005                    self.log(' ', '%s:' % rec.option.name, repr(rec.recommended_value))
1006        if self.opts.verbose > 1:
1007            self.log.debug('Resolved conversion options')
1008            try:
1009                self.log.debug('calibre version:', __version__)
1010                odict = dict(self.opts.__dict__)
1011                for x in ('username', 'password'):
1012                    odict.pop(x, None)
1013                self.log.debug(pprint.pformat(odict))
1014            except:
1015                self.log.exception('Failed to get resolved conversion options')
1016
1017    def flush(self):
1018        try:
1019            sys.stdout.flush()
1020            sys.stderr.flush()
1021        except Exception:
1022            pass
1023
1024    def dump_oeb(self, oeb, out_dir):
1025        from calibre.ebooks.oeb.writer import OEBWriter
1026        w = OEBWriter(pretty_print=self.opts.pretty_print)
1027        w(oeb, out_dir)
1028
1029    def dump_input(self, ret, output_dir):
1030        out_dir = os.path.join(self.opts.debug_pipeline, 'input')
1031        if isinstance(ret, string_or_bytes):
1032            shutil.copytree(output_dir, out_dir)
1033        else:
1034            if not os.path.exists(out_dir):
1035                os.makedirs(out_dir)
1036            self.dump_oeb(ret, out_dir)
1037        if self.input_fmt == 'recipe':
1038            zf = ZipFile(os.path.join(self.opts.debug_pipeline,
1039                'periodical.downloaded_recipe'), 'w')
1040            zf.add_dir(out_dir)
1041            with self.input_plugin:
1042                self.input_plugin.save_download(zf)
1043            zf.close()
1044
1045        self.log.info('Input debug saved to:', out_dir)
1046
    def run(self):
        '''
        Run the conversion pipeline

        The stages are, in order: setting up the options, running the input
        plugin to obtain `self.oeb`, applying the transforms (HTML transform
        rules, data URLs, guide cleaning, metadata merging, structure
        detection, jacket, CSS flattening, margin removal, font embedding and
        subsetting, manifest trimming) and finally running the output plugin,
        which writes to `self.output`, followed by the postprocess plugins.

        When the `debug_pipeline` option is set, snapshots of the book are
        written to the ``input``, ``parsed``, ``structure`` and ``processed``
        sub-folders of that folder.

        The method returns early, without running the output plugin, in
        three cases: AZW4 input converted to PDF (the embedded PDF is simply
        unwrapped), when `abort_after_input_dump` is set and the input has
        been dumped, and when running for the regex wizard.
        '''
        # Setup baseline option values
        self.setup_options()
        if self.opts.verbose:
            self.log.filter_level = self.log.DEBUG
        if self.for_regex_wizard and hasattr(self.opts, 'no_process'):
            self.opts.no_process = True
        self.flush()
        if self.opts.embed_all_fonts or self.opts.embed_font_family:
            # Start the threaded font scanner now, for performance
            from calibre.utils.fonts.scanner import font_scanner  # noqa
        import css_parser, logging
        css_parser.log.setLevel(logging.WARN)
        get_types_map()  # Ensure the mimetypes module is initialized

        # Prepare the debug folder: force verbose logging, (re)write the
        # README and remove the snapshots left behind by a previous run
        if self.opts.debug_pipeline is not None:
            self.opts.verbose = max(self.opts.verbose, 4)
            self.opts.debug_pipeline = os.path.abspath(self.opts.debug_pipeline)
            if not os.path.exists(self.opts.debug_pipeline):
                os.makedirs(self.opts.debug_pipeline)
            with lopen(os.path.join(self.opts.debug_pipeline, 'README.txt'), 'wb') as f:
                f.write(DEBUG_README)
            for x in ('input', 'parsed', 'structure', 'processed'):
                x = os.path.join(self.opts.debug_pipeline, x)
                if os.path.exists(x):
                    shutil.rmtree(x)

        # Run any preprocess plugins
        from calibre.customize.ui import run_plugins_on_preprocess
        self.input = run_plugins_on_preprocess(self.input)

        self.flush()
        # Create an OEBBook from the input file. The input plugin does all the
        # heavy lifting.
        accelerators = {}

        tdir = PersistentTemporaryDirectory('_plumber')
        # Recipes are handed to the input plugin as is, every other input is
        # passed as a file opened in binary mode.
        # NOTE(review): the file opened here is not explicitly closed
        # anywhere in this method
        stream = self.input if self.input_fmt == 'recipe' else \
                lopen(self.input, 'rb')
        if self.input_fmt == 'recipe':
            self.opts.original_recipe_input_arg = self.original_input_arg

        if hasattr(self.opts, 'lrf') and self.output_plugin.file_type == 'lrf':
            self.opts.lrf = True
        # Shortcut: AZW4 to PDF needs no conversion at all
        if self.input_fmt == 'azw4' and self.output_plugin.file_type == 'pdf':
            self.ui_reporter(0.01, 'AZW4 files are simply wrappers around PDF files.'
                             ' Skipping the conversion and unwrapping the embedded PDF instead')
            from calibre.ebooks.azw4.reader import unwrap
            unwrap(stream, self.output)
            self.ui_reporter(1.)
            self.log(self.output_fmt.upper(), 'output written to', self.output)
            self.flush()
            return

        self.ui_reporter(0.01, _('Converting input to HTML...'))
        # The input plugin reports its progress in the 0.01 - 0.34 range of
        # the overall progress
        ir = CompositeProgressReporter(0.01, 0.34, self.ui_reporter)
        self.input_plugin.report_progress = ir
        if self.for_regex_wizard:
            self.input_plugin.for_viewer = True
        self.output_plugin.specialize_options(self.log, self.opts, self.input_fmt)
        with self.input_plugin:
            self.oeb = self.input_plugin(stream, self.opts,
                                        self.input_fmt, self.log,
                                        accelerators, tdir)
            if self.opts.debug_pipeline is not None:
                self.dump_input(self.oeb, tdir)
                if self.abort_after_input_dump:
                    return
            if self.input_fmt in ('recipe', 'downloaded_recipe'):
                self.opts_to_mi(self.user_metadata)
            # The input plugin returned something without a manifest (for
            # example a path), so the book still has to be created from it
            if not hasattr(self.oeb, 'manifest'):
                self.oeb = create_oebbook(
                    self.log, self.oeb, self.opts,
                    encoding=self.input_plugin.output_encoding,
                    for_regex_wizard=self.for_regex_wizard, removed_items=getattr(self.input_plugin, 'removed_items_to_ignore', ()))
            if self.for_regex_wizard:
                return
            self.input_plugin.postprocess_book(self.oeb, self.opts, self.log)
            self.opts.is_image_collection = self.input_plugin.is_image_collection
            # The transforms report their progress in the 0.34 - 0.67 range
            pr = CompositeProgressReporter(0.34, 0.67, self.ui_reporter)
            self.flush()
            if self.opts.debug_pipeline is not None:
                out_dir = os.path.join(self.opts.debug_pipeline, 'parsed')
                self.dump_oeb(self.oeb, out_dir)
                self.log('Parsed HTML written to:', out_dir)
            self.input_plugin.specialize(self.oeb, self.opts, self.log,
                    self.output_fmt)

        pr(0., _('Running transforms on e-book...'))

        self.oeb.plumber_output_format = self.output_fmt or ''

        if self.opts.transform_html_rules:
            transform_html_rules = self.opts.transform_html_rules
            # The rules may be given as a JSON string
            if isinstance(transform_html_rules, string_or_bytes):
                transform_html_rules = json.loads(transform_html_rules)
            from calibre.ebooks.html_transform_rules import transform_conversion_book
            transform_conversion_book(self.oeb, self.opts, transform_html_rules)

        from calibre.ebooks.oeb.transforms.data_url import DataURL
        DataURL()(self.oeb, self.opts)
        from calibre.ebooks.oeb.transforms.guide import Clean
        Clean()(self.oeb, self.opts)
        pr(0.1)
        self.flush()

        # Expose the input and output profiles as opts.source and opts.dest
        self.opts.source = self.opts.input_profile
        self.opts.dest = self.opts.output_profile

        from calibre.ebooks.oeb.transforms.jacket import RemoveFirstImage
        RemoveFirstImage()(self.oeb, self.opts, self.user_metadata)
        from calibre.ebooks.oeb.transforms.metadata import MergeMetadata
        MergeMetadata()(self.oeb, self.user_metadata, self.opts,
                override_input_metadata=self.override_input_metadata)
        pr(0.2)
        self.flush()

        from calibre.ebooks.oeb.transforms.structure import DetectStructure
        DetectStructure()(self.oeb, self.opts)
        pr(0.35)
        self.flush()

        if self.output_plugin.file_type not in ('epub', 'kepub'):
            # Remove the toc reference to the html cover, if any, except for
            # epub, as the epub output plugin will do the right thing with it.
            item = getattr(self.oeb.toc, 'item_that_refers_to_cover', None)
            if item is not None and item.count() == 0:
                self.oeb.toc.remove(item)

        from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener
        # A base font size below 1e-4 is treated as not set, in which case
        # the value from the output profile is used
        fbase = self.opts.base_font_size
        if fbase < 1e-4:
            fbase = float(self.opts.dest.fbase)
        # The font size mapping is a comma separated list of numbers; when
        # it is missing or invalid the output profile's mapping is used
        fkey = self.opts.font_size_mapping
        if fkey is None:
            fkey = self.opts.dest.fkey
        else:
            try:
                fkey = list(map(float, fkey.split(',')))
            except Exception:
                self.log.error('Invalid font size key: %r ignoring'%fkey)
                fkey = self.opts.dest.fkey

        from calibre.ebooks.oeb.transforms.jacket import Jacket
        Jacket()(self.oeb, self.opts, self.user_metadata)
        pr(0.4)
        self.flush()

        if self.opts.debug_pipeline is not None:
            out_dir = os.path.join(self.opts.debug_pipeline, 'structure')
            self.dump_oeb(self.oeb, out_dir)
            self.log('Structured HTML written to:', out_dir)

        # extra_css may name an existing file, in which case the contents of
        # that file (read as bytes) replace the option value
        if self.opts.extra_css and os.path.exists(self.opts.extra_css):
            with open(self.opts.extra_css, 'rb') as f:
                self.opts.extra_css = f.read()

        # Remember these two options: they are switched off for LRF output
        # while the CSS is flattened and restored right after the flattener
        # has run
        oibl = self.opts.insert_blank_line
        orps  = self.opts.remove_paragraph_spacing
        if self.output_plugin.file_type == 'lrf':
            self.opts.insert_blank_line = False
            self.opts.remove_paragraph_spacing = False
        # A line height below 1e-4 is treated as not set
        line_height = self.opts.line_height
        if line_height < 1e-4:
            line_height = None

        if self.opts.linearize_tables and \
                self.output_plugin.file_type not in ('mobi', 'lrf'):
            from calibre.ebooks.oeb.transforms.linearize_tables import LinearizeTables
            LinearizeTables()(self.oeb, self.opts)

        if self.opts.unsmarten_punctuation:
            from calibre.ebooks.oeb.transforms.unsmarten import UnsmartenPunctuation
            UnsmartenPunctuation()(self.oeb, self.opts)

        # LIT output and old style MOBI output get the untable and unfloat
        # treatment from the flattener
        mobi_file_type = getattr(self.opts, 'mobi_file_type', 'old')
        needs_old_markup = (self.output_plugin.file_type == 'lit' or (
            self.output_plugin.file_type == 'mobi' and mobi_file_type == 'old'))
        transform_css_rules = ()
        if self.opts.transform_css_rules:
            transform_css_rules = self.opts.transform_css_rules
            # The rules may be given as a JSON string
            if isinstance(transform_css_rules, string_or_bytes):
                transform_css_rules = json.loads(transform_css_rules)
        flattener = CSSFlattener(fbase=fbase, fkey=fkey,
                lineh=line_height,
                untable=needs_old_markup,
                unfloat=needs_old_markup,
                page_break_on_body=self.output_plugin.file_type in ('mobi',
                    'lit'),
                transform_css_rules=transform_css_rules,
                specializer=partial(self.output_plugin.specialize_css_for_output,
                    self.log, self.opts))
        flattener(self.oeb, self.opts)
        self.opts._final_base_font_size = fbase

        # Restore the options that were saved before flattening
        self.opts.insert_blank_line = oibl
        self.opts.remove_paragraph_spacing = orps

        from calibre.ebooks.oeb.transforms.page_margin import \
            RemoveFakeMargins, RemoveAdobeMargins
        RemoveFakeMargins()(self.oeb, self.log, self.opts)
        RemoveAdobeMargins()(self.oeb, self.log, self.opts)

        if self.opts.embed_all_fonts:
            from calibre.ebooks.oeb.transforms.embed_fonts import EmbedFonts
            EmbedFonts()(self.oeb, self.log, self.opts)

        # Font subsetting is skipped for PDF output
        if self.opts.subset_embedded_fonts and self.output_plugin.file_type != 'pdf':
            from calibre.ebooks.oeb.transforms.subset import SubsetFonts
            SubsetFonts()(self.oeb, self.log, self.opts)

        pr(0.9)
        self.flush()

        from calibre.ebooks.oeb.transforms.trimmanifest import ManifestTrimmer

        self.log.info('Cleaning up manifest...')
        trimmer = ManifestTrimmer()
        trimmer(self.oeb, self.opts)

        self.oeb.toc.rationalize_play_orders()
        pr(1.)
        self.flush()

        if self.opts.debug_pipeline is not None:
            out_dir = os.path.join(self.opts.debug_pipeline, 'processed')
            self.dump_oeb(self.oeb, out_dir)
            self.log('Processed HTML written to:', out_dir)

        self.log.info('Creating %s...'%self.output_plugin.name)
        # The output plugin reports its progress in the 0.67 - 1.0 range
        our = CompositeProgressReporter(0.67, 1., self.ui_reporter)
        self.output_plugin.report_progress = our
        our(0., _('Running %s plugin')%self.output_plugin.name)
        with self.output_plugin:
            self.output_plugin.convert(self.oeb, self.output, self.input_plugin,
                self.opts, self.log)
        self.oeb.clean_temp_files()
        self.ui_reporter(1.)
        run_plugins_on_postprocess(self.output, self.output_fmt)

        self.log(self.output_fmt.upper(), 'output written to', self.output)
        self.flush()
1292
1293
1294# This has to be global as create_oebbook can be called from other locations
1295# (for example in the html input plugin)
1296regex_wizard_callback = None
1297
1298
def set_regex_wizard_callback(f):
    '''
    Install `f` as the module wide `regex_wizard_callback`, which
    `create_oebbook` passes on to the HTMLPreProcessor it creates.
    '''
    global regex_wizard_callback
    regex_wizard_callback = f
1302
1303
def create_oebbook(log, path_or_stream, opts, reader=None,
        encoding='utf-8', populate=True, for_regex_wizard=False, specialize=None, removed_items=()):
    '''
    Create an OEBBook.

    The book is built with an HTMLPreProcessor that uses the module level
    `regex_wizard_callback`. An empty `encoding` is passed on as None. With
    `populate` false the empty book is returned immediately. Otherwise the
    optional `specialize` callable is given the chance to replace the book,
    `removed_items` is stored on it and `reader` (OEBReader by default) is
    used to read `path_or_stream` into it.

    `for_regex_wizard` is accepted but not used by this function.
    '''
    from calibre.ebooks.oeb.base import OEBBook
    preprocessor = HTMLPreProcessor(log, opts, regex_wizard_callback=regex_wizard_callback)
    book = OEBBook(log, preprocessor,
            pretty_print=opts.pretty_print, input_encoding=encoding or None)
    if not populate:
        return book
    if specialize is not None:
        # A specializer may return a replacement book, or nothing at all
        book = specialize(book) or book
    # Read OEB Book into OEBBook
    log('Parsing all content...')
    book.removed_items_to_ignore = removed_items
    if reader is None:
        from calibre.ebooks.oeb.reader import OEBReader
        reader = OEBReader

    reader()(book, path_or_stream)
    return book
1328
1329
def create_dummy_plumber(input_format, output_format):
    '''
    Create a Plumber for the given formats using placeholder file names of
    the form ``dummy.<format>`` and a Log whose outputs list is empty.

    For archive input formats the placeholder input file is ``dummy.html``.
    '''
    from calibre.utils.logging import Log
    input_format, output_format = input_format.lower(), output_format.lower()
    log = Log()
    log.outputs = []
    if input_format in ARCHIVE_FMTS:
        input_file = 'dummy.html'
    else:
        input_file = 'dummy.'+input_format
    return Plumber(input_file, 'dummy.'+output_format, log)
1341