1# -*- coding: utf-8 -*-
2
3
4__license__ = 'GPL 3'
5__copyright__ = '2011, Anthon van der Neut <anthon@mnt.org>'
6__docformat__ = 'restructuredtext en'
7
8import os
9from io import BytesIO
10
11from calibre.customize.conversion import InputFormatPlugin
12
13
class DJVUInput(InputFormatPlugin):
    """Input plugin that converts the text layer of a DJVU file to an OEB book.

    The text is extracted from the DJVU file, turned into basic HTML and
    then handed to the HTML input plugin, which produces the OEB object.
    """

    name        = 'DJVU Input'
    author      = 'Anthon van der Neut'
    description = _('Convert OCR-ed DJVU files (.djvu) to HTML')
    file_types  = {'djvu', 'djv'}
    commit_name = 'djvu_input'

    def convert(self, stream, options, file_ext, log, accelerators):
        """Convert the DJVU data in ``stream`` and return the resulting OEB.

        :param stream: Open file-like object passed to ``DJVUFile`` and,
            later, to ``get_file_type_metadata``.
        :param options: Conversion options object. It is modified in place:
            every option of the HTML input plugin is set to its recommended
            value and ``input_encoding`` is set to ``'utf-8'``.
            ``debug_pipeline`` is temporarily set to ``None`` while the HTML
            plugin runs and is always restored afterwards.
        :param file_ext: File extension, used to look up the metadata reader.
        :param log: Logger passed on to the HTML plugin and metadata code.
        :param accelerators: Not used by this method; the nested HTML
            conversion is given an empty dict instead.
        :return: The OEB object produced by the HTML input plugin, with
            metadata read from ``stream`` applied to it.
        :raises ValueError: If no text could be extracted from the file.
        """
        from calibre.ebooks.txt.processor import convert_basic

        stdout = BytesIO()
        from calibre.ebooks.djvu.djvu import DJVUFile
        x = DJVUFile(stream)
        x.get_text(stdout)
        raw_text = stdout.getvalue()
        if not raw_text:
            raise ValueError('The DJVU file contains no text, only images, probably page scans.'
                    ' calibre only supports conversion of DJVU files with actual text in them.')

        # Newlines become spaces and the 0x1F byte becomes a blank line.
        # NOTE(review): presumably 0x1F marks paragraph boundaries in the
        # output of DJVUFile.get_text -- confirm against that module.
        html = convert_basic(raw_text.replace(b"\n", b' ').replace(
            b'\037', b'\n\n'))
        # Run the HTMLized text through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        # Pick a file name in the current working directory that is not
        # already taken: index.html, index1.html, index2.html, ...
        base = os.getcwd()
        htmlfile = os.path.join(base, 'index.html')
        c = 0
        while os.path.exists(htmlfile):
            c += 1
            htmlfile = os.path.join(base, 'index%d.html'%c)
        odi = options.debug_pipeline
        try:
            with open(htmlfile, 'wb') as f:
                f.write(html.encode('utf-8'))
            # NOTE(review): debug_pipeline is disabled for the nested
            # conversion, presumably so the HTML plugin does not produce its
            # own debug output -- confirm.
            options.debug_pipeline = None
            # Generate oeb from html conversion.
            with open(htmlfile, 'rb') as f:
                oeb = html_input.convert(f, options, 'html', log,
                    {})
        finally:
            # Always restore the caller's setting and clean up the temporary
            # file, even when writing or the nested conversion fails.
            options.debug_pipeline = odi
            if os.path.exists(htmlfile):
                os.remove(htmlfile)

        # Set metadata from file.
        from calibre.customize.ui import get_file_type_metadata
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        mi = get_file_type_metadata(stream, file_ext)
        meta_info_to_oeb_metadata(mi, oeb.metadata, log)

        return oeb
66