1# -*- coding: utf-8 -*- 2 3 4__license__ = 'GPL 3' 5__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' 6__docformat__ = 'restructuredtext en' 7 8import os, re, sys, shutil, pprint, json 9from functools import partial 10 11from calibre.customize.conversion import OptionRecommendation, DummyReporter 12from calibre.customize.ui import input_profiles, output_profiles, \ 13 plugin_for_input_format, plugin_for_output_format, \ 14 available_input_formats, available_output_formats, \ 15 run_plugins_on_preprocess, run_plugins_on_postprocess 16from calibre.ebooks.conversion.preprocess import HTMLPreProcessor 17from calibre.ptempfile import PersistentTemporaryDirectory 18from calibre.utils.date import parse_date 19from calibre.utils.zipfile import ZipFile 20from calibre import (extract, walk, isbytestring, filesystem_encoding, 21 get_types_map) 22from calibre.constants import __version__ 23from polyglot.builtins import string_or_bytes 24 25DEBUG_README=b''' 26This debug folder contains snapshots of the e-book as it passes through the 27various stages of conversion. The stages are: 28 29 1. input - This is the result of running the input plugin on the source 30 file. Use this folder to debug the input plugin. 31 32 2. parsed - This is the result of preprocessing and parsing the output of 33 the input plugin. Note that for some input plugins this will be identical to 34 the input sub-folder. Use this folder to debug structure detection, 35 etc. 36 37 3. structure - This corresponds to the stage in the pipeline when structure 38 detection has run, but before the CSS is flattened. Use this folder to 39 debug the CSS flattening, font size conversion, etc. 40 41 4. processed - This corresponds to the e-book as it is passed to the output 42 plugin. Use this folder to debug the output plugin. 43 44''' 45 46 47def supported_input_formats(): 48 fmts = available_input_formats() 49 for x in ('zip', 'rar', 'oebzip'): 50 fmts.add(x) 51 return fmts 52 53 54class OptionValues: 55 pass 56 57 58class CompositeProgressReporter: 59 60 def __init__(self, global_min, global_max, global_reporter): 61 self.global_min, self.global_max = global_min, global_max 62 self.global_reporter = global_reporter 63 64 def __call__(self, fraction, msg=''): 65 global_frac = self.global_min + fraction * \ 66 (self.global_max - self.global_min) 67 self.global_reporter(global_frac, msg) 68 69 70ARCHIVE_FMTS = ('zip', 'rar', 'oebzip') 71 72 73class Plumber: 74 75 ''' 76 The `Plumber` manages the conversion pipeline. An UI should call the methods 77 :method:`merge_ui_recommendations` and then :method:`run`. The plumber will 78 take care of the rest. 79 ''' 80 81 metadata_option_names = [ 82 'title', 'authors', 'title_sort', 'author_sort', 'cover', 'comments', 83 'publisher', 'series', 'series_index', 'rating', 'isbn', 84 'tags', 'book_producer', 'language', 'pubdate', 'timestamp' 85 ] 86 87 def __init__(self, input, output, log, report_progress=DummyReporter(), 88 dummy=False, merge_plugin_recs=True, abort_after_input_dump=False, 89 override_input_metadata=False, for_regex_wizard=False, view_kepub=False): 90 ''' 91 :param input: Path to input file. 92 :param output: Path to output file/folder 93 ''' 94 if isbytestring(input): 95 input = input.decode(filesystem_encoding) 96 if isbytestring(output): 97 output = output.decode(filesystem_encoding) 98 self.original_input_arg = input 99 self.for_regex_wizard = for_regex_wizard 100 self.input = os.path.abspath(input) 101 self.output = os.path.abspath(output) 102 self.log = log 103 self.ui_reporter = report_progress 104 self.abort_after_input_dump = abort_after_input_dump 105 self.override_input_metadata = override_input_metadata 106 107 # Pipeline options {{{ 108 # Initialize the conversion options that are independent of input and 109 # output formats. The input and output plugins can still disable these 110 # options via recommendations. 111 self.pipeline_options = [ 112 113OptionRecommendation(name='verbose', 114 recommended_value=0, level=OptionRecommendation.LOW, 115 short_switch='v', 116 help=_('Level of verbosity. Specify multiple times for greater ' 117 'verbosity. Specifying it twice will result in full ' 118 'verbosity, once medium verbosity and zero times least verbosity.') 119 ), 120 121OptionRecommendation(name='debug_pipeline', 122 recommended_value=None, level=OptionRecommendation.LOW, 123 short_switch='d', 124 help=_('Save the output from different stages of the conversion ' 125 'pipeline to the specified ' 126 'folder. Useful if you are unsure at which stage ' 127 'of the conversion process a bug is occurring.') 128 ), 129 130OptionRecommendation(name='input_profile', 131 recommended_value='default', level=OptionRecommendation.LOW, 132 choices=[x.short_name for x in input_profiles()], 133 help=_('Specify the input profile. The input profile gives the ' 134 'conversion system information on how to interpret ' 135 'various information in the input document. For ' 136 'example resolution dependent lengths (i.e. lengths in ' 137 'pixels). Choices are:') + ' ' + ', '.join([ 138 x.short_name for x in input_profiles()]) 139 ), 140 141OptionRecommendation(name='output_profile', 142 recommended_value='default', level=OptionRecommendation.LOW, 143 choices=[x.short_name for x in output_profiles()], 144 help=_('Specify the output profile. The output profile ' 145 'tells the conversion system how to optimize the ' 146 'created document for the specified device (such as by resizing images for the device screen size). In some cases, ' 147 'an output profile can be used to optimize the output for a particular device, but this is rarely necessary. ' 148 'Choices are:') + ', '.join([ 149 x.short_name for x in output_profiles()]) 150 ), 151 152OptionRecommendation(name='base_font_size', 153 recommended_value=0, level=OptionRecommendation.LOW, 154 help=_('The base font size in pts. All font sizes in the produced book ' 155 'will be rescaled based on this size. By choosing a larger ' 156 'size you can make the fonts in the output bigger and vice ' 157 'versa. By default, when the value is zero, the base font size is chosen based on ' 158 'the output profile you chose.' 159 ) 160 ), 161 162OptionRecommendation(name='font_size_mapping', 163 recommended_value=None, level=OptionRecommendation.LOW, 164 help=_('Mapping from CSS font names to font sizes in pts. ' 165 'An example setting is 12,12,14,16,18,20,22,24. ' 166 'These are the mappings for the sizes xx-small to xx-large, ' 167 'with the final size being for huge fonts. The font ' 168 'rescaling algorithm uses these sizes to intelligently ' 169 'rescale fonts. The default is to use a mapping based on ' 170 'the output profile you chose.' 171 ) 172 ), 173 174OptionRecommendation(name='disable_font_rescaling', 175 recommended_value=False, level=OptionRecommendation.LOW, 176 help=_('Disable all rescaling of font sizes.' 177 ) 178 ), 179 180OptionRecommendation(name='minimum_line_height', 181 recommended_value=120.0, level=OptionRecommendation.LOW, 182 help=_( 183 'The minimum line height, as a percentage of the element\'s ' 184 'calculated font size. calibre will ensure that every element ' 185 'has a line height of at least this setting, irrespective of ' 186 'what the input document specifies. Set to zero to disable. ' 187 'Default is 120%. Use this setting in preference to ' 188 'the direct line height specification, unless you know what ' 189 'you are doing. For example, you can achieve "double spaced" ' 190 'text by setting this to 240.' 191 ) 192 ), 193 194 195OptionRecommendation(name='line_height', 196 recommended_value=0, level=OptionRecommendation.LOW, 197 help=_( 198 'The line height in pts. Controls spacing between consecutive ' 199 'lines of text. Only applies to elements that do not define ' 200 'their own line height. In most cases, the minimum line height ' 201 'option is more useful. ' 202 'By default no line height manipulation is performed.' 203 ) 204 ), 205 206OptionRecommendation(name='embed_font_family', 207 recommended_value=None, level=OptionRecommendation.LOW, 208 help=_( 209 'Embed the specified font family into the book. This specifies ' 210 'the "base" font used for the book. If the input document ' 211 'specifies its own fonts, they may override this base font. ' 212 'You can use the filter style information option to remove fonts from the ' 213 'input document. Note that font embedding only works ' 214 'with some output formats, principally EPUB, AZW3 and DOCX.') 215 ), 216 217OptionRecommendation(name='embed_all_fonts', 218 recommended_value=False, level=OptionRecommendation.LOW, 219 help=_( 220 'Embed every font that is referenced in the input document ' 221 'but not already embedded. This will search your system for the ' 222 'fonts, and if found, they will be embedded. Embedding will only work ' 223 'if the format you are converting to supports embedded fonts, such as ' 224 'EPUB, AZW3, DOCX or PDF. Please ensure that you have the proper license for embedding ' 225 'the fonts used in this document.' 226 )), 227 228OptionRecommendation(name='subset_embedded_fonts', 229 recommended_value=False, level=OptionRecommendation.LOW, 230 help=_( 231 'Subset all embedded fonts. Every embedded font is reduced ' 232 'to contain only the glyphs used in this document. This decreases ' 233 'the size of the font files. Useful if you are embedding a ' 234 'particularly large font with lots of unused glyphs.') 235 ), 236 237OptionRecommendation(name='linearize_tables', 238 recommended_value=False, level=OptionRecommendation.LOW, 239 help=_('Some badly designed documents use tables to control the ' 240 'layout of text on the page. When converted these documents ' 241 'often have text that runs off the page and other artifacts. ' 242 'This option will extract the content from the tables and ' 243 'present it in a linear fashion.' 244 ) 245 ), 246 247OptionRecommendation(name='level1_toc', 248 recommended_value=None, level=OptionRecommendation.LOW, 249 help=_('XPath expression that specifies all tags that ' 250 'should be added to the Table of Contents at level one. If ' 251 'this is specified, it takes precedence over other forms ' 252 'of auto-detection.' 253 ' See the XPath Tutorial in the calibre User Manual for examples.' 254 ) 255 ), 256 257OptionRecommendation(name='level2_toc', 258 recommended_value=None, level=OptionRecommendation.LOW, 259 help=_('XPath expression that specifies all tags that should be ' 260 'added to the Table of Contents at level two. Each entry is added ' 261 'under the previous level one entry.' 262 ' See the XPath Tutorial in the calibre User Manual for examples.' 263 ) 264 ), 265 266OptionRecommendation(name='level3_toc', 267 recommended_value=None, level=OptionRecommendation.LOW, 268 help=_('XPath expression that specifies all tags that should be ' 269 'added to the Table of Contents at level three. Each entry ' 270 'is added under the previous level two entry.' 271 ' See the XPath Tutorial in the calibre User Manual for examples.' 272 ) 273 ), 274 275OptionRecommendation(name='use_auto_toc', 276 recommended_value=False, level=OptionRecommendation.LOW, 277 help=_('Normally, if the source file already has a Table of ' 278 'Contents, it is used in preference to the auto-generated one. ' 279 'With this option, the auto-generated one is always used.' 280 ) 281 ), 282 283OptionRecommendation(name='no_chapters_in_toc', 284 recommended_value=False, level=OptionRecommendation.LOW, 285 help=_("Don't add auto-detected chapters to the Table of " 286 'Contents.' 287 ) 288 ), 289 290OptionRecommendation(name='toc_threshold', 291 recommended_value=6, level=OptionRecommendation.LOW, 292 help=_( 293 'If fewer than this number of chapters is detected, then links ' 294 'are added to the Table of Contents. Default: %default') 295 ), 296 297OptionRecommendation(name='max_toc_links', 298 recommended_value=50, level=OptionRecommendation.LOW, 299 help=_('Maximum number of links to insert into the TOC. Set to 0 ' 300 'to disable. Default is: %default. Links are only added to the ' 301 'TOC if less than the threshold number of chapters were detected.' 302 ) 303 ), 304 305OptionRecommendation(name='toc_filter', 306 recommended_value=None, level=OptionRecommendation.LOW, 307 help=_('Remove entries from the Table of Contents whose titles ' 308 'match the specified regular expression. Matching entries and all ' 309 'their children are removed.' 310 ) 311 ), 312 313OptionRecommendation(name='duplicate_links_in_toc', 314 recommended_value=False, level=OptionRecommendation.LOW, 315 help=_('When creating a TOC from links in the input document, ' 316 'allow duplicate entries, i.e. allow more than one entry ' 317 'with the same text, provided that they point to a ' 318 'different location.') 319 ), 320 321 322OptionRecommendation(name='chapter', 323 recommended_value="//*[((name()='h1' or name()='h2') and " 324 r"re:test(., '\s*((chapter|book|section|part)\s+)|((prolog|prologue|epilogue)(\s+|$))', 'i')) or @class " 325 "= 'chapter']", level=OptionRecommendation.LOW, 326 help=_('An XPath expression to detect chapter titles. The default ' 327 'is to consider <h1> or <h2> tags that contain the words ' 328 '"chapter", "book", "section", "prologue", "epilogue" or "part" as chapter titles as ' 329 'well as any tags that have class="chapter". The expression ' 330 'used must evaluate to a list of elements. To disable chapter ' 331 'detection, use the expression "/". See the XPath Tutorial ' 332 'in the calibre User Manual for further help on using this ' 333 'feature.' 334 ) 335 ), 336 337OptionRecommendation(name='chapter_mark', 338 recommended_value='pagebreak', level=OptionRecommendation.LOW, 339 choices=['pagebreak', 'rule', 'both', 'none'], 340 help=_('Specify how to mark detected chapters. A value of ' 341 '"pagebreak" will insert page breaks before chapters. ' 342 'A value of "rule" will insert a line before chapters. ' 343 'A value of "none" will disable chapter marking and a ' 344 'value of "both" will use both page breaks and lines ' 345 'to mark chapters.') 346 ), 347 348OptionRecommendation(name='start_reading_at', 349 recommended_value=None, level=OptionRecommendation.LOW, 350 help=_('An XPath expression to detect the location in the document' 351 ' at which to start reading. Some e-book reading programs' 352 ' (most prominently the Kindle) use this location as the' 353 ' position at which to open the book. See the XPath tutorial' 354 ' in the calibre User Manual for further help using this' 355 ' feature.') 356 ), 357 358OptionRecommendation(name='extra_css', 359 recommended_value=None, level=OptionRecommendation.LOW, 360 help=_('Either the path to a CSS stylesheet or raw CSS. ' 361 'This CSS will be appended to the style rules from ' 362 'the source file, so it can be used to override those ' 363 'rules.') 364 ), 365 366OptionRecommendation(name='transform_css_rules', 367 recommended_value=None, level=OptionRecommendation.LOW, 368 help=_('Rules for transforming the styles in this book. These' 369 ' rules are applied after all other CSS processing is done.') 370 ), 371 372OptionRecommendation(name='transform_html_rules', 373 recommended_value=None, level=OptionRecommendation.LOW, 374 help=_('Rules for transforming the HTML in this book. These' 375 ' rules are applied after the HTML is parsed, but before any other transformations.') 376 ), 377 378OptionRecommendation(name='filter_css', 379 recommended_value=None, level=OptionRecommendation.LOW, 380 help=_('A comma separated list of CSS properties that ' 381 'will be removed from all CSS style rules. This is useful ' 382 'if the presence of some style information prevents it ' 383 'from being overridden on your device. ' 384 'For example: ' 385 'font-family,color,margin-left,margin-right') 386 ), 387 388OptionRecommendation(name='expand_css', 389 recommended_value=False, level=OptionRecommendation.LOW, 390 help=_( 391 'By default, calibre will use the shorthand form for various' 392 ' CSS properties such as margin, padding, border, etc. This' 393 ' option will cause it to use the full expanded form instead.' 394 ' Note that CSS is always expanded when generating EPUB files' 395 ' with the output profile set to one of the Nook profiles' 396 ' as the Nook cannot handle shorthand CSS.') 397 ), 398 399OptionRecommendation(name='page_breaks_before', 400 recommended_value="//*[name()='h1' or name()='h2']", 401 level=OptionRecommendation.LOW, 402 help=_('An XPath expression. Page breaks are inserted ' 403 'before the specified elements. To disable use the expression: /') 404 ), 405 406OptionRecommendation(name='remove_fake_margins', 407 recommended_value=True, level=OptionRecommendation.LOW, 408 help=_('Some documents specify page margins by ' 409 'specifying a left and right margin on each individual ' 410 'paragraph. calibre will try to detect and remove these ' 411 'margins. Sometimes, this can cause the removal of ' 412 'margins that should not have been removed. In this ' 413 'case you can disable the removal.') 414 ), 415 416 417OptionRecommendation(name='margin_top', 418 recommended_value=5.0, level=OptionRecommendation.LOW, 419 help=_('Set the top margin in pts. Default is %default. ' 420 'Setting this to less than zero will cause no margin to be set ' 421 '(the margin setting in the original document will be preserved). ' 422 'Note: Page oriented formats such as PDF and DOCX have their own' 423 ' margin settings that take precedence.')), 424 425OptionRecommendation(name='margin_bottom', 426 recommended_value=5.0, level=OptionRecommendation.LOW, 427 help=_('Set the bottom margin in pts. Default is %default. ' 428 'Setting this to less than zero will cause no margin to be set ' 429 '(the margin setting in the original document will be preserved). ' 430 'Note: Page oriented formats such as PDF and DOCX have their own' 431 ' margin settings that take precedence.')), 432 433OptionRecommendation(name='margin_left', 434 recommended_value=5.0, level=OptionRecommendation.LOW, 435 help=_('Set the left margin in pts. Default is %default. ' 436 'Setting this to less than zero will cause no margin to be set ' 437 '(the margin setting in the original document will be preserved). ' 438 'Note: Page oriented formats such as PDF and DOCX have their own' 439 ' margin settings that take precedence.')), 440 441OptionRecommendation(name='margin_right', 442 recommended_value=5.0, level=OptionRecommendation.LOW, 443 help=_('Set the right margin in pts. Default is %default. ' 444 'Setting this to less than zero will cause no margin to be set ' 445 '(the margin setting in the original document will be preserved). ' 446 'Note: Page oriented formats such as PDF and DOCX have their own' 447 ' margin settings that take precedence.')), 448 449OptionRecommendation(name='change_justification', 450 recommended_value='original', level=OptionRecommendation.LOW, 451 choices=['left','justify','original'], 452 help=_('Change text justification. A value of "left" converts all' 453 ' justified text in the source to left aligned (i.e. ' 454 'unjustified) text. A value of "justify" converts all ' 455 'unjustified text to justified. A value of "original" ' 456 '(the default) does not change justification in the ' 457 'source file. Note that only some output formats support ' 458 'justification.')), 459 460OptionRecommendation(name='remove_paragraph_spacing', 461 recommended_value=False, level=OptionRecommendation.LOW, 462 help=_('Remove spacing between paragraphs. Also sets an indent on ' 463 'paragraphs of 1.5em. Spacing removal will not work ' 464 'if the source file does not use paragraphs (<p> or <div> tags).') 465 ), 466 467OptionRecommendation(name='remove_paragraph_spacing_indent_size', 468 recommended_value=1.5, level=OptionRecommendation.LOW, 469 help=_('When calibre removes blank lines between paragraphs, it automatically ' 470 'sets a paragraph indent, to ensure that paragraphs can be easily ' 471 'distinguished. This option controls the width of that indent (in em). ' 472 'If you set this value negative, then the indent specified in the input ' 473 'document is used, that is, calibre does not change the indentation.') 474 ), 475 476OptionRecommendation(name='prefer_metadata_cover', 477 recommended_value=False, level=OptionRecommendation.LOW, 478 help=_('Use the cover detected from the source file in preference ' 479 'to the specified cover.') 480 ), 481 482OptionRecommendation(name='insert_blank_line', 483 recommended_value=False, level=OptionRecommendation.LOW, 484 help=_('Insert a blank line between paragraphs. Will not work ' 485 'if the source file does not use paragraphs (<p> or <div> tags).' 486 ) 487 ), 488 489OptionRecommendation(name='insert_blank_line_size', 490 recommended_value=0.5, level=OptionRecommendation.LOW, 491 help=_('Set the height of the inserted blank lines (in em).' 492 ' The height of the lines between paragraphs will be twice the value' 493 ' set here.') 494 ), 495 496OptionRecommendation(name='remove_first_image', 497 recommended_value=False, level=OptionRecommendation.LOW, 498 help=_('Remove the first image from the input e-book. Useful if the ' 499 'input document has a cover image that is not identified as a cover. ' 500 'In this case, if you set a cover in calibre, the output document will ' 501 'end up with two cover images if you do not specify this option.' 502 ) 503 ), 504 505OptionRecommendation(name='insert_metadata', 506 recommended_value=False, level=OptionRecommendation.LOW, 507 help=_('Insert the book metadata at the start of ' 508 'the book. This is useful if your e-book reader does not support ' 509 'displaying/searching metadata directly.' 510 ) 511 ), 512 513OptionRecommendation(name='smarten_punctuation', 514 recommended_value=False, level=OptionRecommendation.LOW, 515 help=_('Convert plain quotes, dashes and ellipsis to their ' 516 'typographically correct equivalents. For details, see ' 517 'https://daringfireball.net/projects/smartypants.' 518 ) 519 ), 520 521OptionRecommendation(name='unsmarten_punctuation', 522 recommended_value=False, level=OptionRecommendation.LOW, 523 help=_('Convert fancy quotes, dashes and ellipsis to their ' 524 'plain equivalents.' 525 ) 526 ), 527 528OptionRecommendation(name='read_metadata_from_opf', 529 recommended_value=None, level=OptionRecommendation.LOW, 530 short_switch='m', 531 help=_('Read metadata from the specified OPF file. Metadata read ' 532 'from this file will override any metadata in the source ' 533 'file.') 534 ), 535 536OptionRecommendation(name='asciiize', 537 recommended_value=False, level=OptionRecommendation.LOW, 538 help=(_('Transliterate Unicode characters to an ASCII ' 539 'representation. Use with care because this will replace ' 540 'Unicode characters with ASCII. For instance it will replace "{0}" ' 541 'with "{1}". Also, note that in ' 542 'cases where there are multiple representations of a character ' 543 '(characters shared by Chinese and Japanese for instance) the ' 544 'representation based on the current calibre interface language will be ' 545 'used.').format('Pelé', 'Pele')) 546 ), 547 548OptionRecommendation(name='keep_ligatures', 549 recommended_value=False, level=OptionRecommendation.LOW, 550 help=_('Preserve ligatures present in the input document. ' 551 'A ligature is a special rendering of a pair of ' 552 'characters like ff, fi, fl et cetera. ' 553 'Most readers do not have support for ' 554 'ligatures in their default fonts, so they are ' 555 'unlikely to render correctly. By default, calibre ' 556 'will turn a ligature into the corresponding pair of normal ' 557 'characters. This option will preserve them instead.') 558 ), 559 560OptionRecommendation(name='title', 561 recommended_value=None, level=OptionRecommendation.LOW, 562 help=_('Set the title.')), 563 564OptionRecommendation(name='authors', 565 recommended_value=None, level=OptionRecommendation.LOW, 566 help=_('Set the authors. Multiple authors should be separated by ' 567 'ampersands.')), 568 569OptionRecommendation(name='title_sort', 570 recommended_value=None, level=OptionRecommendation.LOW, 571 help=_('The version of the title to be used for sorting. ')), 572 573OptionRecommendation(name='author_sort', 574 recommended_value=None, level=OptionRecommendation.LOW, 575 help=_('String to be used when sorting by author. ')), 576 577OptionRecommendation(name='cover', 578 recommended_value=None, level=OptionRecommendation.LOW, 579 help=_('Set the cover to the specified file or URL')), 580 581OptionRecommendation(name='comments', 582 recommended_value=None, level=OptionRecommendation.LOW, 583 help=_('Set the e-book description.')), 584 585OptionRecommendation(name='publisher', 586 recommended_value=None, level=OptionRecommendation.LOW, 587 help=_('Set the e-book publisher.')), 588 589OptionRecommendation(name='series', 590 recommended_value=None, level=OptionRecommendation.LOW, 591 help=_('Set the series this e-book belongs to.')), 592 593OptionRecommendation(name='series_index', 594 recommended_value=None, level=OptionRecommendation.LOW, 595 help=_('Set the index of the book in this series.')), 596 597OptionRecommendation(name='rating', 598 recommended_value=None, level=OptionRecommendation.LOW, 599 help=_('Set the rating. Should be a number between 1 and 5.')), 600 601OptionRecommendation(name='isbn', 602 recommended_value=None, level=OptionRecommendation.LOW, 603 help=_('Set the ISBN of the book.')), 604 605OptionRecommendation(name='tags', 606 recommended_value=None, level=OptionRecommendation.LOW, 607 help=_('Set the tags for the book. Should be a comma separated list.')), 608 609OptionRecommendation(name='book_producer', 610 recommended_value=None, level=OptionRecommendation.LOW, 611 help=_('Set the book producer.')), 612 613OptionRecommendation(name='language', 614 recommended_value=None, level=OptionRecommendation.LOW, 615 help=_('Set the language.')), 616 617OptionRecommendation(name='pubdate', 618 recommended_value=None, level=OptionRecommendation.LOW, 619 help=_('Set the publication date (assumed to be in the local timezone, unless the timezone is explicitly specified)')), 620 621OptionRecommendation(name='timestamp', 622 recommended_value=None, level=OptionRecommendation.LOW, 623 help=_('Set the book timestamp (no longer used anywhere)')), 624 625OptionRecommendation(name='enable_heuristics', 626 recommended_value=False, level=OptionRecommendation.LOW, 627 help=_('Enable heuristic processing. This option must be set for any ' 628 'heuristic processing to take place.')), 629 630OptionRecommendation(name='markup_chapter_headings', 631 recommended_value=True, level=OptionRecommendation.LOW, 632 help=_('Detect unformatted chapter headings and sub headings. Change ' 633 'them to h2 and h3 tags. This setting will not create a TOC, ' 634 'but can be used in conjunction with structure detection to create ' 635 'one.')), 636 637OptionRecommendation(name='italicize_common_cases', 638 recommended_value=True, level=OptionRecommendation.LOW, 639 help=_('Look for common words and patterns that denote ' 640 'italics and italicize them.')), 641 642OptionRecommendation(name='fix_indents', 643 recommended_value=True, level=OptionRecommendation.LOW, 644 help=_('Turn indentation created from multiple non-breaking space entities ' 645 'into CSS indents.')), 646 647OptionRecommendation(name='html_unwrap_factor', 648 recommended_value=0.40, level=OptionRecommendation.LOW, 649 help=_('Scale used to determine the length at which a line should ' 650 'be unwrapped. Valid values are a decimal between 0 and 1. The ' 651 'default is 0.4, just below the median line length. If only a ' 652 'few lines in the document require unwrapping this value should ' 653 'be reduced')), 654 655OptionRecommendation(name='unwrap_lines', 656 recommended_value=True, level=OptionRecommendation.LOW, 657 help=_('Unwrap lines using punctuation and other formatting clues.')), 658 659OptionRecommendation(name='delete_blank_paragraphs', 660 recommended_value=True, level=OptionRecommendation.LOW, 661 help=_('Remove empty paragraphs from the document when they exist between ' 662 'every other paragraph')), 663 664OptionRecommendation(name='format_scene_breaks', 665 recommended_value=True, level=OptionRecommendation.LOW, 666 help=_('Left aligned scene break markers are center aligned. ' 667 'Replace soft scene breaks that use multiple blank lines with ' 668 'horizontal rules.')), 669 670OptionRecommendation(name='replace_scene_breaks', 671 recommended_value='', level=OptionRecommendation.LOW, 672 help=_('Replace scene breaks with the specified text. By default, the ' 673 'text from the input document is used.')), 674 675OptionRecommendation(name='dehyphenate', 676 recommended_value=True, level=OptionRecommendation.LOW, 677 help=_('Analyze hyphenated words throughout the document. The ' 678 'document itself is used as a dictionary to determine whether hyphens ' 679 'should be retained or removed.')), 680 681OptionRecommendation(name='renumber_headings', 682 recommended_value=True, level=OptionRecommendation.LOW, 683 help=_('Looks for occurrences of sequential <h1> or <h2> tags. ' 684 'The tags are renumbered to prevent splitting in the middle ' 685 'of chapter headings.')), 686 687OptionRecommendation(name='sr1_search', 688 recommended_value='', level=OptionRecommendation.LOW, 689 help=_('Search pattern (regular expression) to be replaced with ' 690 'sr1-replace.')), 691 692OptionRecommendation(name='sr1_replace', 693 recommended_value='', level=OptionRecommendation.LOW, 694 help=_('Replacement to replace the text found with sr1-search.')), 695 696OptionRecommendation(name='sr2_search', 697 recommended_value='', level=OptionRecommendation.LOW, 698 help=_('Search pattern (regular expression) to be replaced with ' 699 'sr2-replace.')), 700 701OptionRecommendation(name='sr2_replace', 702 recommended_value='', level=OptionRecommendation.LOW, 703 help=_('Replacement to replace the text found with sr2-search.')), 704 705OptionRecommendation(name='sr3_search', 706 recommended_value='', level=OptionRecommendation.LOW, 707 help=_('Search pattern (regular expression) to be replaced with ' 708 'sr3-replace.')), 709 710OptionRecommendation(name='sr3_replace', 711 recommended_value='', level=OptionRecommendation.LOW, 712 help=_('Replacement to replace the text found with sr3-search.')), 713 714OptionRecommendation(name='search_replace', 715 recommended_value=None, level=OptionRecommendation.LOW, help=_( 716 'Path to a file containing search and replace regular expressions. ' 717 'The file must contain alternating lines of regular expression ' 718 'followed by replacement pattern (which can be an empty line). ' 719 'The regular expression must be in the Python regex syntax and ' 720 'the file must be UTF-8 encoded.')), 721] 722 # }}} 723 724 input_fmt = os.path.splitext(self.input)[1] 725 if not input_fmt: 726 raise ValueError('Input file must have an extension') 727 input_fmt = input_fmt[1:].lower().replace('original_', '') 728 if view_kepub and input_fmt.lower() == 'kepub': 729 input_fmt = 'epub' 730 self.archive_input_tdir = None 731 self.changed_options = set() 732 if input_fmt in ARCHIVE_FMTS: 733 self.log('Processing archive...') 734 tdir = PersistentTemporaryDirectory('_pl_arc') 735 self.input, input_fmt = self.unarchive(self.input, tdir) 736 self.archive_input_tdir = tdir 737 if os.access(self.input, os.R_OK): 738 nfp = run_plugins_on_preprocess(self.input, input_fmt) 739 if nfp != self.input: 740 self.input = nfp 741 input_fmt = os.path.splitext(self.input)[1] 742 if not input_fmt: 743 raise ValueError('Input file must have an extension') 744 input_fmt = input_fmt[1:].lower() 745 746 if os.path.exists(self.output) and os.path.isdir(self.output): 747 output_fmt = 'oeb' 748 else: 749 output_fmt = os.path.splitext(self.output)[1] 750 if not output_fmt: 751 output_fmt = '.oeb' 752 output_fmt = output_fmt[1:].lower() 753 754 self.input_plugin = plugin_for_input_format(input_fmt) 755 self.output_plugin = plugin_for_output_format(output_fmt) 756 757 if self.input_plugin is None: 758 raise ValueError('No plugin to handle input format: '+input_fmt) 759 760 if self.output_plugin is None: 761 raise ValueError('No plugin to handle output format: '+output_fmt) 762 763 self.input_fmt = input_fmt 764 self.output_fmt = output_fmt 765 766 self.all_format_options = set() 767 self.input_options = set() 768 self.output_options = set() 769 # Build set of all possible options. Two options are equal if their 770 # names are the same. 771 if not dummy: 772 self.input_options = self.input_plugin.options.union( 773 self.input_plugin.common_options) 774 self.output_options = self.output_plugin.options.union( 775 self.output_plugin.common_options) 776 else: 777 for fmt in available_input_formats(): 778 input_plugin = plugin_for_input_format(fmt) 779 if input_plugin: 780 self.all_format_options = self.all_format_options.union( 781 input_plugin.options.union(input_plugin.common_options)) 782 for fmt in available_output_formats(): 783 output_plugin = plugin_for_output_format(fmt) 784 if output_plugin: 785 self.all_format_options = self.all_format_options.union( 786 output_plugin.options.union(output_plugin.common_options)) 787 788 # Remove the options that have been disabled by recommendations from the 789 # plugins. 790 for w in ('input_options', 'output_options', 791 'all_format_options'): 792 temp = set() 793 for x in getattr(self, w): 794 temp.add(x.clone()) 795 setattr(self, w, temp) 796 if merge_plugin_recs: 797 self.merge_plugin_recommendations() 798 799 @classmethod 800 def unarchive(self, path, tdir): 801 extract(path, tdir) 802 files = list(walk(tdir)) 803 files = [f if isinstance(f, str) else f.decode(filesystem_encoding) 804 for f in files] 805 from calibre.customize.ui import available_input_formats 806 fmts = set(available_input_formats()) 807 fmts -= {'htm', 'html', 'xhtm', 'xhtml'} 808 fmts -= set(ARCHIVE_FMTS) 809 810 for ext in fmts: 811 for f in files: 812 if f.lower().endswith('.'+ext): 813 if ext in ['txt', 'rtf'] and os.stat(f).st_size < 2048: 814 continue 815 return f, ext 816 return self.find_html_index(files) 817 818 @classmethod 819 def find_html_index(self, files): 820 ''' 821 Given a list of files, find the most likely root HTML file in the 822 list. 823 ''' 824 html_pat = re.compile(r'\.(x){0,1}htm(l){0,1}$', re.IGNORECASE) 825 html_files = [f for f in files if html_pat.search(f) is not None] 826 if not html_files: 827 raise ValueError(_('Could not find an e-book inside the archive')) 828 html_files = [(f, os.stat(f).st_size) for f in html_files] 829 html_files.sort(key=lambda x: x[1]) 830 html_files = [f[0] for f in html_files] 831 for q in ('toc', 'index'): 832 for f in html_files: 833 if os.path.splitext(os.path.basename(f))[0].lower() == q: 834 return f, os.path.splitext(f)[1].lower()[1:] 835 return html_files[-1], os.path.splitext(html_files[-1])[1].lower()[1:] 836 837 def get_all_options(self): 838 ans = {} 839 for group in (self.input_options, self.pipeline_options, 840 self.output_options, self.all_format_options): 841 for rec in group: 842 ans[rec.option] = rec.recommended_value 843 return ans 844 845 def get_option_by_name(self, name): 846 for group in (self.input_options, self.pipeline_options, 847 self.output_options, self.all_format_options): 848 for rec in group: 849 if rec.option == name: 850 return rec 851 852 def get_option_help(self, name): 853 rec = self.get_option_by_name(name) 854 help = getattr(rec, 'help', None) 855 if help is not None: 856 return help.replace('%default', str(rec.recommended_value)) 857 858 def get_all_help(self): 859 ans = {} 860 for group in (self.input_options, self.pipeline_options, 861 self.output_options, self.all_format_options): 862 for rec in group: 863 help = getattr(rec, 'help', None) 864 if help is not None: 865 ans[rec.option.name] = help 866 return ans 867 868 def merge_plugin_recs(self, plugin): 869 for name, val, level in plugin.recommendations: 870 rec = self.get_option_by_name(name) 871 if rec is not None and rec.level <= level: 872 rec.recommended_value = val 873 rec.level = level 874 875 def merge_plugin_recommendations(self): 876 for source in (self.input_plugin, self.output_plugin): 877 self.merge_plugin_recs(source) 878 879 def merge_ui_recommendations(self, recommendations): 880 ''' 881 Merge recommendations from the UI. As long as the UI recommendation 882 level is >= the baseline recommended level, the UI value is used, 883 *except* if the baseline has a recommendation level of `HIGH`. 884 ''' 885 886 def eq(name, a, b): 887 if name in {'sr1_search', 'sr1_replace', 'sr2_search', 'sr2_replace', 'sr3_search', 'sr3_replace', 'filter_css', 'comments'}: 888 if not a and not b: 889 return True 890 if name in {'transform_css_rules', 'transform_html_rules', 'search_replace'}: 891 if b == '[]': 892 b = None 893 return a == b 894 895 for name, val, level in recommendations: 896 rec = self.get_option_by_name(name) 897 if rec is not None and rec.level <= level and rec.level < rec.HIGH: 898 changed = not eq(name, rec.recommended_value, val) 899 rec.recommended_value = val 900 rec.level = level 901 if changed: 902 self.changed_options.add(rec) 903 904 def opts_to_mi(self, mi): 905 from calibre.ebooks.metadata import string_to_authors 906 for x in self.metadata_option_names: 907 val = getattr(self.opts, x, None) 908 if val is not None: 909 if x == 'authors': 910 val = string_to_authors(val) 911 elif x == 'tags': 912 val = [i.strip() for i in val.split(',')] 913 elif x in ('rating', 'series_index'): 914 try: 915 val = float(val) 916 except ValueError: 917 self.log.warn(_('Values of series index and rating must' 918 ' be numbers. Ignoring'), val) 919 continue 920 elif x in ('timestamp', 'pubdate'): 921 try: 922 val = parse_date(val, assume_utc=x=='timestamp') 923 except: 924 self.log.exception(_('Failed to parse date/time') + ' ' + str(val)) 925 continue 926 setattr(mi, x, val) 927 928 def download_cover(self, url): 929 from calibre import browser 930 from PIL import Image 931 import io 932 from calibre.ptempfile import PersistentTemporaryFile 933 self.log('Downloading cover from %r'%url) 934 br = browser() 935 raw = br.open_novisit(url).read() 936 buf = io.BytesIO(raw) 937 pt = PersistentTemporaryFile('.jpg') 938 pt.close() 939 img = Image.open(buf) 940 img.convert('RGB').save(pt.name) 941 return pt.name 942 943 def read_user_metadata(self): 944 ''' 945 Read all metadata specified by the user. Command line options override 946 metadata from a specified OPF file. 947 ''' 948 from calibre.ebooks.metadata import MetaInformation 949 from calibre.ebooks.metadata.opf2 import OPF 950 mi = MetaInformation(None, []) 951 if self.opts.read_metadata_from_opf is not None: 952 self.opts.read_metadata_from_opf = os.path.abspath( 953 self.opts.read_metadata_from_opf) 954 with lopen(self.opts.read_metadata_from_opf, 'rb') as stream: 955 opf = OPF(stream, os.path.dirname(self.opts.read_metadata_from_opf)) 956 mi = opf.to_book_metadata() 957 self.opts_to_mi(mi) 958 if mi.cover: 959 if mi.cover.startswith('http:') or mi.cover.startswith('https:'): 960 mi.cover = self.download_cover(mi.cover) 961 ext = mi.cover.rpartition('.')[-1].lower().strip() 962 if ext not in ('png', 'jpg', 'jpeg', 'gif'): 963 ext = 'jpg' 964 with lopen(mi.cover, 'rb') as stream: 965 mi.cover_data = (ext, stream.read()) 966 mi.cover = None 967 self.user_metadata = mi 968 969 def setup_options(self): 970 ''' 971 Setup the `self.opts` object. 972 ''' 973 self.opts = OptionValues() 974 for group in (self.input_options, self.pipeline_options, 975 self.output_options, self.all_format_options): 976 for rec in group: 977 setattr(self.opts, rec.option.name, rec.recommended_value) 978 979 def set_profile(profiles, which): 980 attr = which + '_profile' 981 sval = getattr(self.opts, attr) 982 for x in profiles(): 983 if x.short_name == sval: 984 setattr(self.opts, attr, x) 985 return 986 self.log.warn( 987 'Profile (%s) %r is no longer available, using default'%(which, sval)) 988 for x in profiles(): 989 if x.short_name == 'default': 990 setattr(self.opts, attr, x) 991 break 992 993 set_profile(input_profiles, 'input') 994 set_profile(output_profiles, 'output') 995 996 self.read_user_metadata() 997 self.opts.no_inline_navbars = self.opts.output_profile.supports_mobi_indexing \ 998 and self.output_fmt == 'mobi' 999 if self.opts.verbose: 1000 self.log.filter_level = self.log.DEBUG 1001 if self.changed_options: 1002 self.log('Conversion options changed from defaults:') 1003 for rec in self.changed_options: 1004 if rec.option.name not in ('username', 'password'): 1005 self.log(' ', '%s:' % rec.option.name, repr(rec.recommended_value)) 1006 if self.opts.verbose > 1: 1007 self.log.debug('Resolved conversion options') 1008 try: 1009 self.log.debug('calibre version:', __version__) 1010 odict = dict(self.opts.__dict__) 1011 for x in ('username', 'password'): 1012 odict.pop(x, None) 1013 self.log.debug(pprint.pformat(odict)) 1014 except: 1015 self.log.exception('Failed to get resolved conversion options') 1016 1017 def flush(self): 1018 try: 1019 sys.stdout.flush() 1020 sys.stderr.flush() 1021 except Exception: 1022 pass 1023 1024 def dump_oeb(self, oeb, out_dir): 1025 from calibre.ebooks.oeb.writer import OEBWriter 1026 w = OEBWriter(pretty_print=self.opts.pretty_print) 1027 w(oeb, out_dir) 1028 1029 def dump_input(self, ret, output_dir): 1030 out_dir = os.path.join(self.opts.debug_pipeline, 'input') 1031 if isinstance(ret, string_or_bytes): 1032 shutil.copytree(output_dir, out_dir) 1033 else: 1034 if not os.path.exists(out_dir): 1035 os.makedirs(out_dir) 1036 self.dump_oeb(ret, out_dir) 1037 if self.input_fmt == 'recipe': 1038 zf = ZipFile(os.path.join(self.opts.debug_pipeline, 1039 'periodical.downloaded_recipe'), 'w') 1040 zf.add_dir(out_dir) 1041 with self.input_plugin: 1042 self.input_plugin.save_download(zf) 1043 zf.close() 1044 1045 self.log.info('Input debug saved to:', out_dir) 1046 1047 def run(self): 1048 ''' 1049 Run the conversion pipeline 1050 ''' 1051 # Setup baseline option values 1052 self.setup_options() 1053 if self.opts.verbose: 1054 self.log.filter_level = self.log.DEBUG 1055 if self.for_regex_wizard and hasattr(self.opts, 'no_process'): 1056 self.opts.no_process = True 1057 self.flush() 1058 if self.opts.embed_all_fonts or self.opts.embed_font_family: 1059 # Start the threaded font scanner now, for performance 1060 from calibre.utils.fonts.scanner import font_scanner # noqa 1061 import css_parser, logging 1062 css_parser.log.setLevel(logging.WARN) 1063 get_types_map() # Ensure the mimetypes module is initialized 1064 1065 if self.opts.debug_pipeline is not None: 1066 self.opts.verbose = max(self.opts.verbose, 4) 1067 self.opts.debug_pipeline = os.path.abspath(self.opts.debug_pipeline) 1068 if not os.path.exists(self.opts.debug_pipeline): 1069 os.makedirs(self.opts.debug_pipeline) 1070 with lopen(os.path.join(self.opts.debug_pipeline, 'README.txt'), 'wb') as f: 1071 f.write(DEBUG_README) 1072 for x in ('input', 'parsed', 'structure', 'processed'): 1073 x = os.path.join(self.opts.debug_pipeline, x) 1074 if os.path.exists(x): 1075 shutil.rmtree(x) 1076 1077 # Run any preprocess plugins 1078 from calibre.customize.ui import run_plugins_on_preprocess 1079 self.input = run_plugins_on_preprocess(self.input) 1080 1081 self.flush() 1082 # Create an OEBBook from the input file. The input plugin does all the 1083 # heavy lifting. 1084 accelerators = {} 1085 1086 tdir = PersistentTemporaryDirectory('_plumber') 1087 stream = self.input if self.input_fmt == 'recipe' else \ 1088 lopen(self.input, 'rb') 1089 if self.input_fmt == 'recipe': 1090 self.opts.original_recipe_input_arg = self.original_input_arg 1091 1092 if hasattr(self.opts, 'lrf') and self.output_plugin.file_type == 'lrf': 1093 self.opts.lrf = True 1094 if self.input_fmt == 'azw4' and self.output_plugin.file_type == 'pdf': 1095 self.ui_reporter(0.01, 'AZW4 files are simply wrappers around PDF files.' 1096 ' Skipping the conversion and unwrapping the embedded PDF instead') 1097 from calibre.ebooks.azw4.reader import unwrap 1098 unwrap(stream, self.output) 1099 self.ui_reporter(1.) 1100 self.log(self.output_fmt.upper(), 'output written to', self.output) 1101 self.flush() 1102 return 1103 1104 self.ui_reporter(0.01, _('Converting input to HTML...')) 1105 ir = CompositeProgressReporter(0.01, 0.34, self.ui_reporter) 1106 self.input_plugin.report_progress = ir 1107 if self.for_regex_wizard: 1108 self.input_plugin.for_viewer = True 1109 self.output_plugin.specialize_options(self.log, self.opts, self.input_fmt) 1110 with self.input_plugin: 1111 self.oeb = self.input_plugin(stream, self.opts, 1112 self.input_fmt, self.log, 1113 accelerators, tdir) 1114 if self.opts.debug_pipeline is not None: 1115 self.dump_input(self.oeb, tdir) 1116 if self.abort_after_input_dump: 1117 return 1118 if self.input_fmt in ('recipe', 'downloaded_recipe'): 1119 self.opts_to_mi(self.user_metadata) 1120 if not hasattr(self.oeb, 'manifest'): 1121 self.oeb = create_oebbook( 1122 self.log, self.oeb, self.opts, 1123 encoding=self.input_plugin.output_encoding, 1124 for_regex_wizard=self.for_regex_wizard, removed_items=getattr(self.input_plugin, 'removed_items_to_ignore', ())) 1125 if self.for_regex_wizard: 1126 return 1127 self.input_plugin.postprocess_book(self.oeb, self.opts, self.log) 1128 self.opts.is_image_collection = self.input_plugin.is_image_collection 1129 pr = CompositeProgressReporter(0.34, 0.67, self.ui_reporter) 1130 self.flush() 1131 if self.opts.debug_pipeline is not None: 1132 out_dir = os.path.join(self.opts.debug_pipeline, 'parsed') 1133 self.dump_oeb(self.oeb, out_dir) 1134 self.log('Parsed HTML written to:', out_dir) 1135 self.input_plugin.specialize(self.oeb, self.opts, self.log, 1136 self.output_fmt) 1137 1138 pr(0., _('Running transforms on e-book...')) 1139 1140 self.oeb.plumber_output_format = self.output_fmt or '' 1141 1142 if self.opts.transform_html_rules: 1143 transform_html_rules = self.opts.transform_html_rules 1144 if isinstance(transform_html_rules, string_or_bytes): 1145 transform_html_rules = json.loads(transform_html_rules) 1146 from calibre.ebooks.html_transform_rules import transform_conversion_book 1147 transform_conversion_book(self.oeb, self.opts, transform_html_rules) 1148 1149 from calibre.ebooks.oeb.transforms.data_url import DataURL 1150 DataURL()(self.oeb, self.opts) 1151 from calibre.ebooks.oeb.transforms.guide import Clean 1152 Clean()(self.oeb, self.opts) 1153 pr(0.1) 1154 self.flush() 1155 1156 self.opts.source = self.opts.input_profile 1157 self.opts.dest = self.opts.output_profile 1158 1159 from calibre.ebooks.oeb.transforms.jacket import RemoveFirstImage 1160 RemoveFirstImage()(self.oeb, self.opts, self.user_metadata) 1161 from calibre.ebooks.oeb.transforms.metadata import MergeMetadata 1162 MergeMetadata()(self.oeb, self.user_metadata, self.opts, 1163 override_input_metadata=self.override_input_metadata) 1164 pr(0.2) 1165 self.flush() 1166 1167 from calibre.ebooks.oeb.transforms.structure import DetectStructure 1168 DetectStructure()(self.oeb, self.opts) 1169 pr(0.35) 1170 self.flush() 1171 1172 if self.output_plugin.file_type not in ('epub', 'kepub'): 1173 # Remove the toc reference to the html cover, if any, except for 1174 # epub, as the epub output plugin will do the right thing with it. 1175 item = getattr(self.oeb.toc, 'item_that_refers_to_cover', None) 1176 if item is not None and item.count() == 0: 1177 self.oeb.toc.remove(item) 1178 1179 from calibre.ebooks.oeb.transforms.flatcss import CSSFlattener 1180 fbase = self.opts.base_font_size 1181 if fbase < 1e-4: 1182 fbase = float(self.opts.dest.fbase) 1183 fkey = self.opts.font_size_mapping 1184 if fkey is None: 1185 fkey = self.opts.dest.fkey 1186 else: 1187 try: 1188 fkey = list(map(float, fkey.split(','))) 1189 except Exception: 1190 self.log.error('Invalid font size key: %r ignoring'%fkey) 1191 fkey = self.opts.dest.fkey 1192 1193 from calibre.ebooks.oeb.transforms.jacket import Jacket 1194 Jacket()(self.oeb, self.opts, self.user_metadata) 1195 pr(0.4) 1196 self.flush() 1197 1198 if self.opts.debug_pipeline is not None: 1199 out_dir = os.path.join(self.opts.debug_pipeline, 'structure') 1200 self.dump_oeb(self.oeb, out_dir) 1201 self.log('Structured HTML written to:', out_dir) 1202 1203 if self.opts.extra_css and os.path.exists(self.opts.extra_css): 1204 with open(self.opts.extra_css, 'rb') as f: 1205 self.opts.extra_css = f.read() 1206 1207 oibl = self.opts.insert_blank_line 1208 orps = self.opts.remove_paragraph_spacing 1209 if self.output_plugin.file_type == 'lrf': 1210 self.opts.insert_blank_line = False 1211 self.opts.remove_paragraph_spacing = False 1212 line_height = self.opts.line_height 1213 if line_height < 1e-4: 1214 line_height = None 1215 1216 if self.opts.linearize_tables and \ 1217 self.output_plugin.file_type not in ('mobi', 'lrf'): 1218 from calibre.ebooks.oeb.transforms.linearize_tables import LinearizeTables 1219 LinearizeTables()(self.oeb, self.opts) 1220 1221 if self.opts.unsmarten_punctuation: 1222 from calibre.ebooks.oeb.transforms.unsmarten import UnsmartenPunctuation 1223 UnsmartenPunctuation()(self.oeb, self.opts) 1224 1225 mobi_file_type = getattr(self.opts, 'mobi_file_type', 'old') 1226 needs_old_markup = (self.output_plugin.file_type == 'lit' or ( 1227 self.output_plugin.file_type == 'mobi' and mobi_file_type == 'old')) 1228 transform_css_rules = () 1229 if self.opts.transform_css_rules: 1230 transform_css_rules = self.opts.transform_css_rules 1231 if isinstance(transform_css_rules, string_or_bytes): 1232 transform_css_rules = json.loads(transform_css_rules) 1233 flattener = CSSFlattener(fbase=fbase, fkey=fkey, 1234 lineh=line_height, 1235 untable=needs_old_markup, 1236 unfloat=needs_old_markup, 1237 page_break_on_body=self.output_plugin.file_type in ('mobi', 1238 'lit'), 1239 transform_css_rules=transform_css_rules, 1240 specializer=partial(self.output_plugin.specialize_css_for_output, 1241 self.log, self.opts)) 1242 flattener(self.oeb, self.opts) 1243 self.opts._final_base_font_size = fbase 1244 1245 self.opts.insert_blank_line = oibl 1246 self.opts.remove_paragraph_spacing = orps 1247 1248 from calibre.ebooks.oeb.transforms.page_margin import \ 1249 RemoveFakeMargins, RemoveAdobeMargins 1250 RemoveFakeMargins()(self.oeb, self.log, self.opts) 1251 RemoveAdobeMargins()(self.oeb, self.log, self.opts) 1252 1253 if self.opts.embed_all_fonts: 1254 from calibre.ebooks.oeb.transforms.embed_fonts import EmbedFonts 1255 EmbedFonts()(self.oeb, self.log, self.opts) 1256 1257 if self.opts.subset_embedded_fonts and self.output_plugin.file_type != 'pdf': 1258 from calibre.ebooks.oeb.transforms.subset import SubsetFonts 1259 SubsetFonts()(self.oeb, self.log, self.opts) 1260 1261 pr(0.9) 1262 self.flush() 1263 1264 from calibre.ebooks.oeb.transforms.trimmanifest import ManifestTrimmer 1265 1266 self.log.info('Cleaning up manifest...') 1267 trimmer = ManifestTrimmer() 1268 trimmer(self.oeb, self.opts) 1269 1270 self.oeb.toc.rationalize_play_orders() 1271 pr(1.) 1272 self.flush() 1273 1274 if self.opts.debug_pipeline is not None: 1275 out_dir = os.path.join(self.opts.debug_pipeline, 'processed') 1276 self.dump_oeb(self.oeb, out_dir) 1277 self.log('Processed HTML written to:', out_dir) 1278 1279 self.log.info('Creating %s...'%self.output_plugin.name) 1280 our = CompositeProgressReporter(0.67, 1., self.ui_reporter) 1281 self.output_plugin.report_progress = our 1282 our(0., _('Running %s plugin')%self.output_plugin.name) 1283 with self.output_plugin: 1284 self.output_plugin.convert(self.oeb, self.output, self.input_plugin, 1285 self.opts, self.log) 1286 self.oeb.clean_temp_files() 1287 self.ui_reporter(1.) 1288 run_plugins_on_postprocess(self.output, self.output_fmt) 1289 1290 self.log(self.output_fmt.upper(), 'output written to', self.output) 1291 self.flush() 1292 1293 1294# This has to be global as create_oebbook can be called from other locations 1295# (for example in the html input plugin) 1296regex_wizard_callback = None 1297 1298 1299def set_regex_wizard_callback(f): 1300 global regex_wizard_callback 1301 regex_wizard_callback = f 1302 1303 1304def create_oebbook(log, path_or_stream, opts, reader=None, 1305 encoding='utf-8', populate=True, for_regex_wizard=False, specialize=None, removed_items=()): 1306 ''' 1307 Create an OEBBook. 1308 ''' 1309 from calibre.ebooks.oeb.base import OEBBook 1310 html_preprocessor = HTMLPreProcessor(log, opts, regex_wizard_callback=regex_wizard_callback) 1311 if not encoding: 1312 encoding = None 1313 oeb = OEBBook(log, html_preprocessor, 1314 pretty_print=opts.pretty_print, input_encoding=encoding) 1315 if not populate: 1316 return oeb 1317 if specialize is not None: 1318 oeb = specialize(oeb) or oeb 1319 # Read OEB Book into OEBBook 1320 log('Parsing all content...') 1321 oeb.removed_items_to_ignore = removed_items 1322 if reader is None: 1323 from calibre.ebooks.oeb.reader import OEBReader 1324 reader = OEBReader 1325 1326 reader()(oeb, path_or_stream) 1327 return oeb 1328 1329 1330def create_dummy_plumber(input_format, output_format): 1331 from calibre.utils.logging import Log 1332 input_format = input_format.lower() 1333 output_format = output_format.lower() 1334 output_path = 'dummy.'+output_format 1335 log = Log() 1336 log.outputs = [] 1337 input_file = 'dummy.'+input_format 1338 if input_format in ARCHIVE_FMTS: 1339 input_file = 'dummy.html' 1340 return Plumber(input_file, output_path, log) 1341