# -*- coding: utf-8 -*-


__license__ = 'GPL 3'
__copyright__ = '2011, Anthon van der Neut <anthon@mnt.org>'
__docformat__ = 'restructuredtext en'

import os
from io import BytesIO

from calibre.customize.conversion import InputFormatPlugin


class DJVUInput(InputFormatPlugin):
    '''
    Input plugin that converts the text layer of a DJVU file.

    The text is extracted, turned into basic HTML and then run through the
    regular HTML input plugin to build the book.
    '''

    name        = 'DJVU Input'
    author      = 'Anthon van der Neut'
    description = _('Convert OCR-ed DJVU files (.djvu) to HTML')
    file_types  = {'djvu', 'djv'}
    commit_name = 'djvu_input'

    def convert(self, stream, options, file_ext, log, accelerators):
        '''
        Convert the text contained in a DJVU file into an OEB book.

        :param stream: Open file-like object with the DJVU data. It is handed
            to ``DJVUFile`` and later to ``get_file_type_metadata``.
        :param options: Conversion options. The options declared by the HTML
            input plugin are overwritten with their recommended values and
            ``input_encoding`` is forced to ``'utf-8'``. ``debug_pipeline``
            is cleared only for the duration of the nested HTML conversion.
        :param file_ext: Extension of the input file, used for the metadata
            lookup.
        :param log: Logger passed on to the HTML plugin and to the metadata
            transform.
        :param accelerators: Not used by this plugin.
        :return: The object returned by the HTML input plugin's ``convert``,
            with the metadata read from ``stream`` applied to it.
        :raises ValueError: If no text could be extracted from the file.
        '''
        from calibre.ebooks.txt.processor import convert_basic

        stdout = BytesIO()
        from calibre.ebooks.djvu.djvu import DJVUFile
        x = DJVUFile(stream)
        x.get_text(stdout)
        raw_text = stdout.getvalue()
        if not raw_text:
            raise ValueError('The DJVU file contains no text, only images, probably page scans.'
                    ' calibre only supports conversion of DJVU files with actual text in them.')

        # Newlines become spaces and every \037 byte becomes a blank line.
        # NOTE(review): \037 is presumably the separator get_text() writes
        # between blocks of text -- confirm against DJVUFile.
        html = convert_basic(raw_text.replace(b"\n", b' ').replace(
            b'\037', b'\n\n'))
        # Run the HTMLized text through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        base = os.getcwd()
        # Pick a file name in the current directory that is not taken yet.
        htmlfile = os.path.join(base, 'index.html')
        c = 0
        while os.path.exists(htmlfile):
            c += 1
            htmlfile = os.path.join(base, 'index%d.html'%c)
        with open(htmlfile, 'wb') as f:
            f.write(html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        try:
            # Generate oeb from html conversion.
            with open(htmlfile, 'rb') as f:
                oeb = html_input.convert(f, options, 'html', log,
                    {})
        finally:
            # Undo the temporary changes even when the HTML conversion fails,
            # so the caller's options are not left modified and the
            # intermediate file is not left behind.
            options.debug_pipeline = odi
            os.remove(htmlfile)

        # Set metadata from file.
        from calibre.customize.ui import get_file_type_metadata
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        mi = get_file_type_metadata(stream, file_ext)
        meta_info_to_oeb_metadata(mi, oeb.metadata, log)

        return oeb