1# vim:fileencoding=utf-8
2# License: GPLv3 Copyright: 2008, Kovid Goyal <kovid at kovidgoyal.net>
3
4
5import copy
6import glob
7import os
8import re
9import sys
10import tempfile
11from collections import deque
12from functools import partial
13from itertools import chain
14from math import ceil, floor
15
16from calibre import (
17    __appname__, entity_to_unicode, fit_image, force_unicode, preferred_encoding
18)
19from calibre.constants import filesystem_encoding
20from calibre.devices.interface import DevicePlugin as Device
21from calibre.ebooks import ConversionError
22from calibre.ebooks.BeautifulSoup import (
23    BeautifulSoup, Comment, Declaration, NavigableString, ProcessingInstruction, Tag
24)
25from calibre.ebooks.chardet import xml_to_unicode
26from calibre.ebooks.lrf import Book
27from calibre.ebooks.lrf.html.color_map import lrs_color
28from calibre.ebooks.lrf.html.table import Table
29from calibre.ebooks.lrf.pylrs.pylrs import (
30    CR, BlockSpace, BookSetting, Canvas, CharButton, DropCaps, EmpLine, Image,
31    ImageBlock, ImageStream, Italic, JumpButton, LrsError, Paragraph, Plot,
32    RuledLine, Span, Sub, Sup, TextBlock
33)
34from calibre.ptempfile import PersistentTemporaryFile
35from polyglot.builtins import itervalues, string_or_bytes
36from polyglot.urllib import unquote, urlparse
37
38"""
39Code to convert HTML ebooks into LRF ebooks.
40
41I am indebted to esperanc for the initial CSS->Xylog Style conversion code
42and to Falstaff for pylrs.
43"""
44
45from PIL import Image as PILImage
46
47
def update_css(ncss, ocss):
    """
    Merge the stylesheet C{ncss} into C{ocss}, modifying C{ocss} in place.

    Both arguments map a selector to a dictionary of properties. Selectors
    already present in C{ocss} have their properties updated; new selectors
    are added, sharing the property dictionary object with C{ncss}.
    """
    for selector, props in ncss.items():
        if selector not in ocss:
            # Stored by reference, not copied
            ocss[selector] = props
            continue
        ocss[selector].update(props)
54
55
def munge_paths(basepath, url):
    """
    Resolve C{url} against the file C{basepath}.

    The URL is unquoted and split into its path and fragment. An empty path
    refers to C{basepath} itself; a relative path is taken relative to the
    directory containing C{basepath}.

    @return: A tuple C{(normalized filesystem path, fragment)}
    """
    parsed = urlparse(unquote(url))
    path, fragment = parsed.path, parsed.fragment
    if not path:
        path = basepath
    else:
        # URLs always use '/', convert to the local separator
        path = path.replace('/', os.sep)
        if not os.path.isabs(path):
            path = os.path.join(os.path.dirname(basepath), path)
    return os.path.normpath(path), fragment
67
68
def strip_style_comments(match):
    """
    Regex substitution callback that removes /* ... */ comments from the
    matched text. An unterminated comment removes everything from its
    opening marker to the end of the text.
    """
    text = match.group()
    start = text.find('/*')
    while start >= 0:
        end = text.find('*/', start)
        if end < 0:
            # Unterminated comment: drop the rest of the text
            return text[:start]
        text = text[:start] + text[end + 2:]
        # Rescan from the beginning of the shortened text
        start = text.find('/*')
    return text
81
82
def tag_regex(tagname):
    '''
    Build non-grouping regular expressions for C{tagname}.

    @return: A dict whose C{open} entry matches the opening tag (with or
             without attributes) and whose C{close} entry matches the
             closing tag.
    '''
    subs = {'t': tagname}
    opening = r'(?:<\s*%(t)s\s+[^<>]*?>|<\s*%(t)s\s*>)' % subs
    closing = r'</\s*%(t)s\s*>' % subs
    return {'open': opening, 'close': closing}
87
88
89class HTMLConverter:
    # Matches one CSS rule: group 1 is the selector list, group 2 the text between the braces
    SELECTOR_PAT   = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
    # Matches a page-break-after/page-break-before declaration: group 1 is its value
    PAGE_BREAK_PAT = re.compile(r'page-break-(?:after|before)\s*:\s*(\w+)', re.IGNORECASE)
    # Node types that are skipped when walking the children of a tag
    IGNORED_TAGS   = (Comment, Declaration, ProcessingInstruction)
93
94    MARKUP_MASSAGE   = [
95                        # Close <a /> tags
96                        (re.compile(r'<a(\s[^>]*)?/>', re.IGNORECASE),
97                         lambda match: '<a'+match.group(1)+'></a>'),
98                        # Strip comments from <style> tags. This is needed as
99                        # sometimes there are unterminated comments
100                        (re.compile(r"<\s*style.*?>(.*?)<\/\s*style\s*>", re.DOTALL|re.IGNORECASE),
101                         lambda match: match.group().replace('<!--', '').replace('-->', '')),
102                        # remove <p> tags from within <a href> tags
103                        (re.compile(r'<\s*a\s+[^<>]*href\s*=[^<>]*>(.*?)<\s*/\s*a\s*>', re.DOTALL|re.IGNORECASE),
104                         lambda match: re.compile(r'%(open)s|%(close)s'%tag_regex('p'), re.IGNORECASE).sub('', match.group())),
105
106                        # Replace common line break patterns with line breaks
107                        (re.compile(r'<p>(&nbsp;|\s)*</p>', re.IGNORECASE), lambda m: '<br />'),
108
109                        # Replace empty headers with line breaks
110                        (re.compile(r'<h[0-5]?>(&nbsp;|\s)*</h[0-5]?>',
111                                    re.IGNORECASE), lambda m: '<br />'),
112
113                        # Replace entities
114                        (re.compile(r'&(\S+?);'), partial(entity_to_unicode,
115                                                           exceptions=['lt', 'gt', 'amp', 'quot'])),
116                        # Remove comments from within style tags as they can mess up BeatifulSoup
117                        (re.compile(r'(<style.*?</style>)', re.IGNORECASE|re.DOTALL),
118                         strip_style_comments),
119
120                        # Remove self closing script tags as they also mess up BeautifulSoup
121                        (re.compile(r'(?i)<script[^<>]+?/>'), lambda match: ''),
122
123                        # BeautifulSoup treats self closing <div> tags as open <div> tags
124                        (re.compile(r'(?i)<\s*div([^>]*)/\s*>'),
125                         lambda match: '<div%s></div>'%match.group(1))
126
127                        ]
    # Fix Baen markup. Extra (pattern, replacement) pairs that preprocess()
    # applies when self.baen is set.
    BAEN = [
                     # Drop page-break-before declarations, keeping the character that followed them
                     (re.compile(r'page-break-before:\s*\w+([\s;\}])', re.IGNORECASE),
                      lambda match: match.group(1)),
                     # Unwrap <p> elements that contain only an empty <a id...> anchor
                     (re.compile(r'<p>\s*(<a id.*?>\s*</a>)\s*</p>', re.IGNORECASE),
                      lambda match: match.group(1)),
                     # Remove empty anchors whose id and name are both of the form p<digits>
                     (re.compile(r'<\s*a\s+id="p[0-9]+"\s+name="p[0-9]+"\s*>\s*</a>', re.IGNORECASE),
                      lambda match: ''),
                     ]
    # Fix pdftohtml markup. Extra (pattern, replacement) pairs that
    # preprocess() applies when self.pdftohtml is set.
    PDFTOHTML  = [
                  # Replace <hr> tags with line breaks
                  (re.compile(r'<hr.*?>', re.IGNORECASE), lambda match: '<br />'),
                  # Remove page numbers (digits immediately followed by <br>)
                  (re.compile(r'\d+<br>', re.IGNORECASE), lambda match: ''),
                  # Replace <br><br> with <p>
                  (re.compile(r'<br.*?>\s*<br.*?>', re.IGNORECASE), lambda match: '<p>'),
                  # Remove a trailing <br> unless the text before it starts
                  # with '<' or is shorter than 40 characters
                  (re.compile(r'(.*)<br.*?>', re.IGNORECASE),
                   lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40
                                else match.group(1)),
                  # Remove hyphenation
                  (re.compile(r'-\n\r?'), lambda match: ''),

                  ]
152
153    # Fix Book Designer markup
154    BOOK_DESIGNER = [
155                     # HR
156                     (re.compile('<hr>', re.IGNORECASE),
157                      lambda match : '<span style="page-break-after:always"> </span>'),
158                     # Create header tags
159                     (re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
160                      lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
161                     (re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
162                      lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
163                     (re.compile(r'<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
164                      lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
165                     (re.compile(r'<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
166                      lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
167                     # Blank lines
168                     (re.compile(r'<div[^><]*?>(&nbsp;){4}</div>', re.IGNORECASE),
169                      lambda match : '<p></p>'),
170                     ]
171
172    def __hasattr__(self, attr):
173        if hasattr(self.options, attr):
174            return True
175        return object.__hasattr__(self, attr)
176
177    def __getattr__(self, attr):
178        if hasattr(self.options, attr):
179            return getattr(self.options, attr)
180        return object.__getattribute__(self, attr)
181
182    def __setattr__(self, attr, val):
183        if hasattr(self.options, attr):
184            setattr(self.options, attr, val)
185        else:
186            object.__setattr__(self, attr, val)
187
    # Default style properties, keyed by tag name or class selector.
    # add_file() starts each file's self.css from a copy of this mapping.
    CSS = {
           'h1'     : {"font-size"   : "xx-large", "font-weight":"bold", 'text-indent':'0pt'},
           'h2'     : {"font-size"   : "x-large", "font-weight":"bold", 'text-indent':'0pt'},
           'h3'     : {"font-size"   : "large", "font-weight":"bold", 'text-indent':'0pt'},
           'h4'     : {"font-size"   : "large", 'text-indent':'0pt'},
           'h5'     : {"font-weight" : "bold", 'text-indent':'0pt'},
           'b'      : {"font-weight" : "bold"},
           'strong' : {"font-weight" : "bold"},
           'i'      : {"font-style"  : "italic"},
           'cite'   : {'font-style'  : 'italic'},
           'em'     : {"font-style"  : "italic"},
           'small'  : {'font-size'   : 'small'},
           'pre'    : {'font-family' : 'monospace', 'white-space': 'pre'},
           'code'   : {'font-family' : 'monospace'},
           'tt'     : {'font-family' : 'monospace'},
           'center' : {'text-align'  : 'center'},
           'th'     : {'font-size'   : 'large', 'font-weight':'bold'},
           'big'    : {'font-size'   : 'large', 'font-weight':'bold'},
           '.libprs500_dropcaps' : {'font-size': 'xx-large'},
           'u'      : {'text-decoration': 'underline'},
           'sup'    : {'vertical-align': 'super', 'font-size': '60%'},
           'sub'    : {'vertical-align': 'sub', 'font-size': '60%'},
           }
211
212    def __init__(self, book, fonts, options, logger, paths):
213        '''
214        Convert HTML files at C{paths} and add to C{book}. After creating
215        the object, you must call L{self.writeto} to output the LRF/S file.
216
217        @param book: The LRF book
218        @type book:  L{lrf.pylrs.Book}
219        @param fonts: dict specifying the font families to use
220        '''
221        # Defaults for various formatting tags
222        object.__setattr__(self, 'options', options)
223        self.log = logger
224        self.fonts = fonts  # : dict specifying font families to use
225        # Memory
226        self.scaled_images    = {}    #: Temporary files with scaled version of images
227        self.rotated_images   = {}    #: Temporary files with rotated version of images
228        self.text_styles      = []    #: Keep track of already used textstyles
229        self.block_styles     = []    #: Keep track of already used blockstyles
230        self.images  = {}      #: Images referenced in the HTML document
231        self.targets = {}      #: <a name=...> and id elements
232        self.links   = deque()  # : <a href=...> elements
233        self.processed_files = []
234        self.extra_toc_entries = []  # : TOC entries gleaned from semantic information
235        self.image_memory = []
236        self.id_counter = 0
237        self.unused_target_blocks = []  # : Used to remove extra TextBlocks
238        self.link_level  = 0    #: Current link level
239        self.memory = []        #: Used to ensure that duplicate CSS unhandled errors are not reported
240        self.tops = {}          #: element representing the top of each HTML file in the LRF file
241        self.previous_text = ''  # : Used to figure out when to lstrip
242        self.stripped_space = ''
243        self.preserve_block_style = False  # : Used so that <p> tags in <blockquote> elements are handled properly
244        self.avoid_page_break = False
245        self.current_page = book.create_page()
246
247        # Styles
248        self.blockquote_style = book.create_block_style(sidemargin=60,
249                                                        topskip=20, footskip=20)
250        self.unindented_style = book.create_text_style(parindent=0)
251
252        self.in_table = False
253        # List processing
254        self.list_level = 0
255        self.list_indent = 20
256        self.list_counter = 1
257
258        self.book = book                #: The Book object representing a BBeB book
259
260        self.override_css = {}
261        self.override_pcss = {}
262
263        if self._override_css is not None:
264            if os.access(self._override_css, os.R_OK):
265                with open(self._override_css, 'rb') as f:
266                    src = f.read()
267            else:
268                src = self._override_css
269            if isinstance(src, bytes):
270                src = src.decode('utf-8', 'replace')
271            match = self.PAGE_BREAK_PAT.search(src)
272            if match and not re.match('avoid', match.group(1), re.IGNORECASE):
273                self.page_break_found = True
274            ncss, npcss = self.parse_css(src)
275            if ncss:
276                update_css(ncss, self.override_css)
277            if npcss:
278                update_css(npcss, self.override_pcss)
279
280        paths = [os.path.abspath(path) for path in paths]
281        paths = [path.decode(sys.getfilesystemencoding()) if not isinstance(path, str) else path for path in paths]
282
283        while len(paths) > 0 and self.link_level <= self.link_levels:
284            for path in paths:
285                if path in self.processed_files:
286                    continue
287                try:
288                    self.add_file(path)
289                except KeyboardInterrupt:
290                    raise
291                except:
292                    if self.link_level == 0:  # Die on errors in the first level
293                        raise
294                    for link in self.links:
295                        if link['path'] == path:
296                            self.links.remove(link)
297                            break
298                    self.log.warn('Could not process '+path)
299                    if self.verbose:
300                        self.log.exception(' ')
301            self.links = self.process_links()
302            self.link_level += 1
303            paths = [link['path'] for link in self.links]
304
305        if self.current_page is not None and self.current_page.has_text():
306            self.book.append(self.current_page)
307
308        for text, tb in self.extra_toc_entries:
309            self.book.addTocEntry(text, tb)
310
311        if self.base_font_size > 0:
312            self.log.info('\tRationalizing font sizes...')
313            self.book.rationalize_font_sizes(self.base_font_size)
314
315    def is_baen(self, soup):
316        return bool(soup.find('meta', attrs={'name':'Publisher',
317                        'content':re.compile('Baen', re.IGNORECASE)}))
318
319    def is_book_designer(self, raw):
320        return bool(re.search('<H2[^><]*id=BookTitle', raw))
321
322    def preprocess(self, raw):
323        nmassage = []
324        nmassage.extend(HTMLConverter.MARKUP_MASSAGE)
325
326        if not self.book_designer and self.is_book_designer(raw):
327            self.book_designer = True
328            self.log.info(_('\tBook Designer file detected.'))
329
330        self.log.info(_('\tParsing HTML...'))
331
332        if self.baen:
333            nmassage.extend(HTMLConverter.BAEN)
334
335        if self.pdftohtml:
336            nmassage.extend(HTMLConverter.PDFTOHTML)
337        if self.book_designer:
338            nmassage.extend(HTMLConverter.BOOK_DESIGNER)
339        if isinstance(raw, bytes):
340            raw = xml_to_unicode(raw, replace_entities=True)[0]
341        for pat, repl in nmassage:
342            raw = pat.sub(repl, raw)
343        soup = BeautifulSoup(raw)
344        if not self.baen and self.is_baen(soup):
345            self.baen = True
346            self.log.info(_('\tBaen file detected. Re-parsing...'))
347            return self.preprocess(raw)
348        if self.book_designer:
349            t = soup.find(id='BookTitle')
350            if t:
351                self.book.set_title(self.get_text(t))
352            a = soup.find(id='BookAuthor')
353            if a:
354                self.book.set_author(self.get_text(a))
355        if self.verbose:
356            tdir = tempfile.gettempdir()
357            if not os.path.exists(tdir):
358                os.makedirs(tdir)
359            try:
360                with open(os.path.join(tdir, 'html2lrf-verbose.html'), 'wb') as f:
361                    f.write(str(soup).encode('utf-8'))
362                    self.log.info(_('Written preprocessed HTML to ')+f.name)
363            except:
364                pass
365
366        return soup
367
368    def add_file(self, path):
369        self.css = HTMLConverter.CSS.copy()
370        self.pseudo_css = self.override_pcss.copy()
371        for selector in self.override_css:
372            if selector in self.css:
373                self.css[selector].update(self.override_css[selector])
374            else:
375                self.css[selector] = self.override_css[selector]
376
377        self.file_name = os.path.basename(path)
378        self.log.info(_('Processing %s')%(path if self.verbose else self.file_name))
379
380        if not os.path.exists(path):
381            path = path.replace('&', '%26')  # convertlit replaces & with %26 in file names
382        with open(path, 'rb') as f:
383            raw = f.read()
384        if self.pdftohtml:  # Bug in pdftohtml that causes it to output invalid UTF-8 files
385            raw = raw.decode('utf-8', 'ignore')
386        elif self.encoding is not None:
387            raw = raw.decode(self.encoding, 'ignore')
388        else:
389            raw = xml_to_unicode(raw, self.verbose)[0]
390        soup = self.preprocess(raw)
391        self.log.info(_('\tConverting to BBeB...'))
392        self.current_style = {}
393        self.page_break_found = False
394        if not isinstance(path, str):
395            path = path.decode(sys.getfilesystemencoding())
396        self.target_prefix = path
397        self.previous_text = '\n'
398        self.tops[path] = self.parse_file(soup)
399        self.processed_files.append(path)
400
401    def parse_css(self, style):
402        """
403        Parse the contents of a <style> tag or .css file.
404        @param style: C{str(style)} should be the CSS to parse.
405        @return: A dictionary with one entry per selector where the key is the
406        selector name and the value is a dictionary of properties
407        """
408        sdict, pdict = {}, {}
409        style = re.sub(r'/\*.*?\*/', '', style)  # Remove /*...*/ comments
410        for sel in re.findall(HTMLConverter.SELECTOR_PAT, style):
411            for key in sel[0].split(','):
412                val = self.parse_style_properties(sel[1])
413                key = key.strip().lower()
414                if '+' in key:
415                    continue
416                if ':' in key:
417                    key, sep, pseudo = key.partition(':')
418                    if key in pdict:
419                        if pseudo in pdict[key]:
420                            pdict[key][pseudo].update(val)
421                        else:
422                            pdict[key][pseudo] = val
423                    else:
424                        pdict[key] = {pseudo:val}
425                else:
426                    if key in sdict:
427                        sdict[key].update(val)
428                    else:
429                        sdict[key] = val
430        return sdict, pdict
431
432    def parse_style_properties(self, props):
433        """
434        Parses a style attribute. The code within a CSS selector block or in
435        the style attribute of an HTML element.
436        @return: A dictionary with one entry for each property where the key
437                is the property name and the value is the property value.
438        """
439        prop = dict()
440        for s in props.split(';'):
441            l = s.split(':',1)
442            if len(l)==2:
443                key = l[0].strip().lower()
444                val = l[1].strip()
445                prop[key] = val
446        return prop
447
448    def tag_css(self, tag, parent_css={}):
449        """
450        Return a dictionary of style properties applicable to Tag tag.
451        """
452        def merge_parent_css(prop, pcss):
453            # float should not be inherited according to the CSS spec
454            # however we need to as we don't do alignment at a block level.
455            # float is removed by the process_alignment function.
456            inherited = ['text-align', 'float', 'white-space', 'color',
457                         'line-height', 'vertical-align']
458            temp = {}
459            for key in pcss.keys():
460                chk = key.lower()
461                # float should not be inherited according to the CSS spec
462                # however we need to as we don't do alignment at a block level.
463                # float is removed by the process_alignment function.
464                if chk.startswith('font') or chk in inherited:
465                    temp[key] = pcss[key]
466            prop.update(temp)
467
468        prop, pprop = {}, {}
469        tagname = tag.name.lower()
470        if parent_css:
471            merge_parent_css(prop, parent_css)
472        if tag.has_attr("align"):
473            al = tag['align'].lower()
474            if al in ('left', 'right', 'center', 'justify'):
475                prop["text-align"] = al
476        if tagname in self.css:
477            prop.update(self.css[tagname])
478        if tagname in self.pseudo_css:
479            pprop.update(self.pseudo_css[tagname])
480        if tag.has_attr("class"):
481            cls = tag['class']
482            if isinstance(cls, list):
483                cls = ' '.join(cls)
484            cls = cls.lower()
485            for cls in cls.split():
486                for classname in ["."+cls, tagname+"."+cls]:
487                    if classname in self.css:
488                        prop.update(self.css[classname])
489                    if classname in self.pseudo_css:
490                        pprop.update(self.pseudo_css[classname])
491        if tag.has_attr('id') and tag['id'] in self.css:
492            prop.update(self.css[tag['id']])
493        if tag.has_attr("style"):
494            prop.update(self.parse_style_properties(tag["style"]))
495        return prop, pprop
496
    def parse_file(self, soup):
        """
        Convert the parsed document C{soup} and append the result to the book.

        @param soup: The parsed document, as returned by L{preprocess}
        @return: The block that represents the top of this file. add_file()
                 stores it in C{self.tops}.
        @raise ConversionError: if no block can be found for the file.
        """
        # NOTE(review): get_valid_block is not called anywhere in this method
        def get_valid_block(page):
            for item in page.contents:
                if isinstance(item, (Canvas, TextBlock, ImageBlock, RuledLine)):
                    if isinstance(item, TextBlock) and not item.contents:
                        continue
                    return item
        if not self.current_page:
            self.current_page = self.book.create_page()
        self.current_block = self.book.create_text_block()
        self.current_para = Paragraph()
        if self.cover:
            # The cover is added only once, before the first file
            self.add_image_page(self.cover)
            self.cover = None
        # The first block of the file is what links to this file point to
        top = self.current_block
        self.current_block.must_append = True

        self.soup = soup
        self.process_children(soup, {}, {})
        self.soup = None

        # Flush whatever paragraph, block and page are still open
        if self.current_para and self.current_block:
            self.current_para.append_to(self.current_block)
        if self.current_block and self.current_page:
            self.current_block.append_to(self.current_page)
        if self.avoid_page_break:
            # Keep the current page open so that the next file continues on it
            self.avoid_page_break = False
        elif self.current_page and self.current_page.has_text():
            self.book.append(self.current_page)
            self.current_page = None

        if top not in top.parent.contents:  # May have been removed for a cover image
            top = top.parent.contents[0]
        if not top.has_text() and top.parent.contents.index(top) == len(top.parent.contents)-1:
            # Empty block at the bottom of a page
            opage = top.parent
            top.parent.contents.remove(top)
            if self.book.last_page() is opage:
                # Fall back to the first text/image block of the current page
                if self.current_page and self.current_page.has_text():
                    for c in self.current_page.contents:
                        if isinstance(c, (TextBlock, ImageBlock)):
                            return c
                raise ConversionError(_('Could not parse file: %s')%self.file_name)
            else:
                try:
                    index = self.book.pages().index(opage)
                except ValueError:
                    # The page of the removed block is not among the book's pages
                    self.log.warning(_('%s is an empty file')%self.file_name)
                    tb = self.book.create_text_block()
                    self.current_page.append(tb)
                    return tb
                # Use the first block found on the pages that follow
                for page in list(self.book.pages()[index+1:]):
                    for c in page.contents:
                        if isinstance(c, (TextBlock, ImageBlock, Canvas)):
                            return c
                raise ConversionError(_('Could not parse file: %s')%self.file_name)

        return top
555
556    def create_link(self, children, tag):
557        para = None
558        for i in range(len(children)-1, -1, -1):
559            if isinstance(children[i], (Span, EmpLine)):
560                para = children[i]
561                break
562        if para is None:
563            raise ConversionError(
564                _('Failed to parse link %(tag)s %(children)s')%dict(
565                    tag=tag, children=children))
566        text = self.get_text(tag, 1000)
567        if not text:
568            text = 'Link'
569            img = tag.find('img')
570            if img:
571                try:
572                    text = img['alt']
573                except KeyError:
574                    pass
575
576        path, fragment = munge_paths(self.target_prefix, tag['href'])
577        return {'para':para, 'text':text, 'path':os.path.abspath(path),
578                'fragment':fragment, 'in toc': (self.link_level == 0 and
579                    not self.use_spine and not self.options.no_links_in_toc)}
580
581    def get_text(self, tag, limit=None):
582        css = self.tag_css(tag)[0]
583        if ('display' in css and css['display'].lower() == 'none') or ('visibility' in css and css['visibility'].lower() == 'hidden'):
584            return ''
585        text, alt_text = '', ''
586        for c in tag.contents:
587            if limit is not None and len(text) > limit:
588                break
589            if isinstance(c, HTMLConverter.IGNORED_TAGS):
590                continue
591            if isinstance(c, NavigableString):
592                text += str(c)
593            elif isinstance(c, Tag):
594                if c.name.lower() == 'img' and c.has_attr('alt'):
595                    alt_text += c['alt']
596                    continue
597                text += self.get_text(c)
598        return text if text.strip() else alt_text
599
600    def process_links(self):
601        def add_toc_entry(text, target):
602            # TextBlocks in Canvases have a None parent or an Objects Parent
603            if target.parent is not None and \
604               hasattr(target.parent, 'objId'):
605                self.book.addTocEntry(ascii_text, tb)
606            else:
607                self.log.debug("Cannot add link %s to TOC"%ascii_text)
608
609        def get_target_block(fragment, targets):
610            '''Return the correct block for the <a name> element'''
611            bs = targets[fragment]
612            if not isinstance(bs, BlockSpace):
613                return bs
614            ans, found, page = None, False, bs.parent
615            for item in page.contents:
616                if found:
617                    if isinstance(item, (TextBlock, RuledLine, ImageBlock)):
618                        ans = item
619                        break
620                if item == bs:
621                    found = True
622                    continue
623
624            if not ans:
625                for i in range(len(page.contents)-1, -1, -1):
626                    if isinstance(page.contents[i], (TextBlock, RuledLine, ImageBlock)):
627                        ans = page.contents[i]
628                        break
629
630            if not ans:
631                ntb = self.book.create_text_block()
632                ntb.Paragraph(' ')
633                page.append(ntb)
634                ans = ntb
635
636            if found:
637                targets[fragment] =  ans
638                page.contents.remove(bs)
639            return ans
640
641        outside_links = deque()
642        while len(self.links) > 0:
643            link = self.links.popleft()
644            para, text, path, fragment = link['para'], link['text'], link['path'], link['fragment']
645            ascii_text = text
646
647            if not isinstance(path, str):
648                path = path.decode(sys.getfilesystemencoding())
649            if path in self.processed_files:
650                if path+fragment in self.targets.keys():
651                    tb = get_target_block(path+fragment, self.targets)
652                else:
653                    tb = self.tops[path]
654                if link['in toc']:
655                    add_toc_entry(ascii_text, tb)
656
657                jb = JumpButton(tb)
658                self.book.append(jb)
659                cb = CharButton(jb, text=text)
660                para.contents = []
661                para.append(cb)
662                try:
663                    self.unused_target_blocks.remove(tb)
664                except ValueError:
665                    pass
666            else:
667                outside_links.append(link)
668
669        return outside_links
670
671    def create_toc(self, toc):
672        for item in toc.top_level_items():
673            ascii_text = item.text
674            if not item.fragment and item.abspath in self.tops:
675                self.book.addTocEntry(ascii_text, self.tops[item.abspath])
676            elif item.abspath:
677                url = item.abspath+(item.fragment if item.fragment else '')
678                if url in self.targets:
679                    self.book.addTocEntry(ascii_text, self.targets[url])
680
681    def end_page(self):
682        """
683        End the current page, ensuring that any further content is displayed
684        on a new page.
685        """
686        if self.current_para.has_text():
687            self.current_para.append_to(self.current_block)
688            self.current_para = Paragraph()
689        if self.current_block.has_text() or self.current_block.must_append:
690            self.current_block.append_to(self.current_page)
691            self.current_block = self.book.create_text_block()
692        if self.current_page.has_text():
693            self.book.append(self.current_page)
694            self.current_page = self.book.create_page()
695
696    def add_image_page(self, path):
697        if os.access(path, os.R_OK):
698            self.end_page()
699            pwidth, pheight = self.profile.screen_width, self.profile.screen_height - \
700                              self.profile.fudge
701            page = self.book.create_page(evensidemargin=0, oddsidemargin=0,
702                                         topmargin=0, textwidth=pwidth,
703                                         headheight=0, headsep=0, footspace=0,
704                                         footheight=0,
705                                         textheight=pheight)
706            if path not in self.images:
707                self.images[path] = ImageStream(path)
708            im = PILImage.open(path)
709            width, height = im.size
710            canvas = Canvas(pwidth, pheight)
711            ib = ImageBlock(self.images[path], x1=width,
712                            y1=height, xsize=width, ysize=height,
713                            blockwidth=width, blockheight=height)
714            canvas.put_object(ib, int((pwidth-width)/2.), int((pheight-height)/2.))
715            page.append(canvas)
716            self.book.append(page)
717
718    def process_children(self, ptag, pcss, ppcss={}):
719        """ Process the children of ptag """
720        # Need to make a copy of contents as when
721        # extract is called on a child, it will
722        # mess up the iteration.
723        for c in copy.copy(ptag.contents):
724            if isinstance(c, HTMLConverter.IGNORED_TAGS):
725                continue
726            elif isinstance(c, Tag):
727                self.parse_tag(c, pcss)
728            elif isinstance(c, NavigableString):
729                self.add_text(c, pcss, ppcss)
730        if not self.in_table:
731            try:
732                if self.minimize_memory_usage:
733                    ptag.extract()
734            except AttributeError:
735                print(ptag, type(ptag))
736
737    def get_alignment(self, css):
738        val = css['text-align'].lower() if 'text-align' in css else None
739        align = 'head'
740        if val is not None:
741            if val in ["right", "foot"]:
742                align = "foot"
743            elif val == "center":
744                align = "center"
745        if 'float' in css:
746            val = css['float'].lower()
747            if val == 'left':
748                align = 'head'
749            if val == 'right':
750                align = 'foot'
751            css.pop('float')
752        return align
753
754    def process_alignment(self, css):
755        '''
756        Create a new TextBlock only if necessary as indicated by css
757        @type css: dict
758        '''
759        align = self.get_alignment(css)
760        if align != self.current_block.textStyle.attrs['align']:
761            self.current_para.append_to(self.current_block)
762            self.current_block.append_to(self.current_page)
763            ts = self.book.create_text_style(**self.current_block.textStyle.attrs)
764            ts.attrs['align'] = align
765            try:
766                index = self.text_styles.index(ts)
767                ts = self.text_styles[index]
768            except ValueError:
769                self.text_styles.append(ts)
770            self.current_block = self.book.create_text_block(
771                                blockStyle=self.current_block.blockStyle,
772                                textStyle=ts)
773            self.current_para = Paragraph()
774            return True
775        return False
776
777    def add_text(self, tag, css, pseudo_css, force_span_use=False):
778        '''
779        Add text to the current paragraph taking CSS into account.
780        @param tag: Either a BeautifulSoup tag or a string
781        @param css: A dict
782        '''
783        src = tag.string if hasattr(tag, 'string') else tag
784        if len(src) > 32760:
785            pos = 0
786            while pos < len(src):
787                self.add_text(src[pos:pos+32760], css, pseudo_css, force_span_use)
788                pos += 32760
789            return
790        src = src.replace('\r\n', '\n').replace('\r', '\n')
791
792        if 'first-letter' in pseudo_css and len(src) > 1:
793            src = src.lstrip()
794            f = src[0]
795            next = 1
796            if f in ("'", '"', '\u201c', '\u2018', '\u201d', '\u2019'):
797                if len(src) >= 2:
798                    next = 2
799                    f = src[:2]
800            src = src[next:]
801            ncss = css.copy()
802            ncss.update(pseudo_css.pop('first-letter'))
803            self.add_text(f, ncss, {}, force_span_use)
804
805        collapse_whitespace = 'white-space' not in css or css['white-space'] != 'pre'
806        if self.process_alignment(css) and collapse_whitespace:
807            # Dont want leading blanks in a new paragraph
808            src = src.lstrip()
809
810        def append_text(src):
811            fp, key, variant = self.font_properties(css)
812            for x, y in [('\xad', ''), ('\xa0', ' '), ('\ufb00', 'ff'), ('\ufb01', 'fi'), ('\ufb02', 'fl'), ('\ufb03', 'ffi'), ('\ufb04', 'ffl')]:
813                src = src.replace(x, y)
814
815            valigner = lambda x: x
816            if 'vertical-align' in css:
817                valign = css['vertical-align']
818                if valign in ('sup', 'super', 'sub'):
819                    fp['fontsize'] = int(fp['fontsize']) * 5 // 3
820                    valigner = Sub if valign == 'sub' else Sup
821            normal_font_size = int(fp['fontsize'])
822
823            if variant == 'small-caps':
824                dump = Span(fontsize=normal_font_size-30)
825                temp = []
826                for c in src:
827                    if c.isupper():
828                        if temp:
829                            dump.append(valigner(''.join(temp)))
830                            temp = []
831                        dump.append(Span(valigner(c), fontsize=normal_font_size))
832                    else:
833                        temp.append(c.upper())
834                src = dump
835                if temp:
836                    src.append(valigner(''.join(temp)))
837            else:
838                src = valigner(src)
839
840            if key in ['italic', 'bi']:
841                already_italic = False
842                for fonts in self.fonts.values():
843                    it = fonts['italic'][1] if 'italic' in fonts else ''
844                    bi = fonts['bi'][1] if 'bi' in fonts else ''
845                    if fp['fontfacename'] in (it, bi):
846                        already_italic = True
847                        break
848                if not already_italic:
849                    src = Italic(src)
850
851            unneeded = []
852            for prop in fp:
853                if fp[prop] == self.current_block.textStyle.attrs[prop]:
854                    unneeded.append(prop)
855            for prop in unneeded:
856                fp.pop(prop)
857            attrs = {}
858            if 'color' in css and not self.ignore_colors:
859                attrs['textcolor'] = lrs_color(css['color'])
860            attrs.update(fp)
861            elem = Span(text=src, **attrs) if (attrs or force_span_use) else src
862            if 'text-decoration' in css:
863                dec = css['text-decoration'].lower()
864                linepos = 'after' if dec == 'underline' else 'before' if dec == 'overline' else None
865                if linepos is not None:
866                    elem = EmpLine(elem, emplineposition=linepos)
867            self.current_para.append(elem)
868
869        if collapse_whitespace:
870            src = re.sub(r'\s{1,}', ' ', src)
871            if self.stripped_space and len(src) == len(src.lstrip(' \n\r\t')):
872                src = self.stripped_space + src
873            src, orig = src.rstrip(' \n\r\t'), src
874            self.stripped_space = orig[len(src):]
875            if len(self.previous_text) != len(self.previous_text.rstrip(' \n\r\t')):
876                src = src.lstrip(' \n\r\t')
877            if len(src):
878                self.previous_text = src
879                append_text(src)
880        else:
881            srcs = src.split('\n')
882            for src in srcs[:-1]:
883                append_text(src)
884                self.line_break()
885            last = srcs[-1]
886            if len(last):
887                append_text(last)
888
889    def line_break(self):
890        self.current_para.append(CR())
891        self.previous_text = '\n'
892
893    def end_current_para(self):
894        '''
895        End current paragraph with a paragraph break after it.
896        '''
897        if self.current_para.contents:
898            self.current_block.append(self.current_para)
899        self.current_block.append(CR())
900        self.current_para = Paragraph()
901
902    def end_current_block(self):
903        '''
904        End current TextBlock. Create new TextBlock with the same styles.
905        '''
906        if self.current_para.contents:
907            self.current_block.append(self.current_para)
908            self.current_para = Paragraph()
909        if self.current_block.contents or self.current_block.must_append:
910            self.current_page.append(self.current_block)
911            self.current_block = self.book.create_text_block(textStyle=self.current_block.textStyle,
912                                                         blockStyle=self.current_block.blockStyle)
913
914    def process_image(self, path, tag_css, width=None, height=None,
915                      dropcaps=False, rescale=False):
916        def detect_encoding(im):
917            fmt = im.format
918            if fmt == 'JPG':
919                fmt = 'JPEG'
920            return fmt
921        original_path = path
922        if path in self.rotated_images:
923            path = self.rotated_images[path].name
924        if path in self.scaled_images:
925            path = self.scaled_images[path].name
926
927        try:
928            im = PILImage.open(path)
929        except OSError as err:
930            self.log.warning('Unable to process image: %s\n%s'%(original_path, err))
931            return
932        encoding = detect_encoding(im)
933
934        def scale_image(width, height):
935            if width <= 0:
936                width = 1
937            if height <= 0:
938                height = 1
939            pt = PersistentTemporaryFile(suffix='_html2lrf_scaled_image_.'+encoding.lower())
940            self.image_memory.append(pt)  # Necessary, trust me ;-)
941            try:
942                im.resize((int(width), int(height)), PILImage.ANTIALIAS).save(pt, encoding)
943                pt.close()
944                self.scaled_images[path] = pt
945                return pt.name
946            except (OSError, SystemError) as err:  # PIL chokes on interlaced PNG images as well a some GIF images
947                self.log.warning(
948                    _('Unable to process image %(path)s. Error: %(err)s')%dict(
949                        path=path, err=err))
950
951        if width is None or height is None:
952            width, height = im.size
953        elif rescale and (width < im.size[0] or height < im.size[1]):
954            path = scale_image(width, height)
955            if not path:
956                return
957
958        factor = 720./self.profile.dpi
959        pheight = int(self.current_page.pageStyle.attrs['textheight'])
960        pwidth  = int(self.current_page.pageStyle.attrs['textwidth'])
961
962        if dropcaps:
963            scale = False
964            if width > 0.75*pwidth:
965                width = int(0.75*pwidth)
966                scale = True
967            if height > 0.75*pheight:
968                height = int(0.75*pheight)
969                scale = True
970            if scale:
971                path = scale_image(width, height)
972            if path not in self.images:
973                self.images[path] = ImageStream(path)
974            im = Image(self.images[path], x0=0, y0=0, x1=width, y1=height,
975                               xsize=width, ysize=height)
976            line_height = (int(self.current_block.textStyle.attrs['baselineskip']) +
977                            int(self.current_block.textStyle.attrs['linespace']))//10
978            line_height *= self.profile.dpi/72
979            lines = int(ceil(height/line_height))
980            dc = DropCaps(lines)
981            dc.append(Plot(im, xsize=ceil(width*factor), ysize=ceil(height*factor)))
982            self.current_para.append(dc)
983            return
984
985        if self.autorotation and width > pwidth and width > height:
986            pt = PersistentTemporaryFile(suffix='_html2lrf_rotated_image_.'+encoding.lower())
987            try:
988                im = im.rotate(90)
989                im.save(pt, encoding)
990                path = pt.name
991                self.rotated_images[path] = pt
992                width, height = im.size
993            except OSError:  # PIL chokes on interlaced PNG files and since auto-rotation is not critical we ignore the error
994                self.log.debug(_('Unable to process interlaced PNG %s')% original_path)
995            finally:
996                pt.close()
997
998        scaled, width, height = fit_image(width, height, pwidth, pheight)
999        if scaled:
1000            path = scale_image(width, height)
1001
1002        if not path:
1003            return
1004
1005        if path not in self.images:
1006            try:
1007                self.images[path] = ImageStream(path, encoding=encoding)
1008            except LrsError as err:
1009                self.log.warning(('Could not process image: %s\n%s')%(
1010                    original_path, err))
1011                return
1012
1013        im = Image(self.images[path], x0=0, y0=0, x1=width, y1=height,
1014                               xsize=width, ysize=height)
1015
1016        self.process_alignment(tag_css)
1017
1018        if max(width, height) <= min(pwidth, pheight)/5:
1019            self.current_para.append(Plot(im, xsize=ceil(width*factor),
1020                                          ysize=ceil(height*factor)))
1021        elif height <= int(floor((2/3)*pheight)):
1022            pb = self.current_block
1023            self.end_current_para()
1024            self.process_alignment(tag_css)
1025            self.current_para.append(Plot(im, xsize=width*factor,
1026                                          ysize=height*factor))
1027            self.current_block.append(self.current_para)
1028            self.current_page.append(self.current_block)
1029            self.current_block = self.book.create_text_block(
1030                                            textStyle=pb.textStyle,
1031                                            blockStyle=pb.blockStyle)
1032            self.current_para = Paragraph()
1033        else:
1034            self.end_page()
1035            if len(self.current_page.contents) == 1 and not self.current_page.has_text():
1036                self.current_page.contents[0:1] = []
1037            self.current_page.append(Canvas(width=pwidth,
1038                                            height=height))
1039            left = int(floor((pwidth - width)/2))
1040            self.current_page.contents[-1].put_object(
1041                            ImageBlock(self.images[path], xsize=width,
1042                                       ysize=height, x1=width, y1=height,
1043                                       blockwidth=width, blockheight=height),
1044                            left, 0)
1045
1046    def process_page_breaks(self, tag, tagname, tag_css):
1047        if 'page-break-before' in tag_css.keys():
1048            if tag_css['page-break-before'].lower() != 'avoid':
1049                self.end_page()
1050            tag_css.pop('page-break-before')
1051        end_page = False
1052        if 'page-break-after' in tag_css.keys():
1053            if tag_css['page-break-after'].lower() == 'avoid':
1054                self.avoid_page_break = True
1055            else:
1056                end_page = True
1057            tag_css.pop('page-break-after')
1058        if (self.force_page_break_attr[0].match(tagname) and
1059           tag.has_attr(self.force_page_break_attr[1]) and
1060           self.force_page_break_attr[2].match(tag[self.force_page_break_attr[1]])) or \
1061           self.force_page_break.match(tagname):
1062            self.end_page()
1063            self.page_break_found = True
1064        if not self.page_break_found and self.page_break.match(tagname):
1065            number_of_paragraphs = sum(
1066                len([1 for i in block.contents if isinstance(i, Paragraph)])
1067                for block in self.current_page.contents if isinstance(block, TextBlock)
1068            )
1069
1070            if number_of_paragraphs > 2:
1071                self.end_page()
1072                self.log.debug('Forcing page break at %s'%tagname)
1073        return end_page
1074
1075    def block_properties(self, tag_css):
1076
1077        def get(what):
1078            src = [None for i in range(4)]
1079            if what in tag_css:
1080                msrc = tag_css[what].split()
1081                for i in range(min(len(msrc), len(src))):
1082                    src[i] = msrc[i]
1083            for i, c in enumerate(('-top', '-right', '-bottom', '-left')):
1084                if what + c in tag_css:
1085                    src[i] = tag_css[what+c]
1086            return src
1087
1088        s1, s2 = get('margin'), get('padding')
1089
1090        bl = str(self.current_block.blockStyle.attrs['blockwidth'])+'px'
1091
1092        def set(default, one, two):
1093            fval = None
1094            if one is not None:
1095                val = self.unit_convert(one, base_length='10pt' if 'em' in one else bl)
1096                if val is not None:
1097                    fval = val
1098            if two is not None:
1099                val = self.unit_convert(two, base_length='10pt' if 'em' in two else bl)
1100                if val is not None:
1101                    fval = val if fval is None else fval + val
1102            if fval is None:
1103                fval = default
1104            return fval
1105
1106        ans = {}
1107        ans['topskip']    = set(self.book.defaultBlockStyle.attrs['topskip'], s1[0], s2[0])
1108        ans['footskip']   = set(self.book.defaultBlockStyle.attrs['footskip'], s1[2], s2[2])
1109        ans['sidemargin'] = set(self.book.defaultBlockStyle.attrs['sidemargin'], s1[3], s2[3])
1110
1111        factor = 0.7
1112        if 2*int(ans['sidemargin']) >= factor*int(self.current_block.blockStyle.attrs['blockwidth']):
1113            # Try using (left + right)/2
1114            val = int(ans['sidemargin'])
1115            ans['sidemargin'] = set(self.book.defaultBlockStyle.attrs['sidemargin'], s1[1], s2[1])
1116            val += int(ans['sidemargin'])
1117            val /= 2.
1118            ans['sidemargin'] = int(val)
1119        if 2*int(ans['sidemargin']) >= factor*int(self.current_block.blockStyle.attrs['blockwidth']):
1120            ans['sidemargin'] = int((factor*int(self.current_block.blockStyle.attrs['blockwidth'])) / 2)
1121
1122        for prop in ('topskip', 'footskip', 'sidemargin'):
1123            if isinstance(ans[prop], string_or_bytes):
1124                ans[prop] = int(ans[prop])
1125            if ans[prop] < 0:
1126                ans[prop] = 0
1127
1128        return ans
1129
1130    def font_properties(self, css):
1131        '''
1132        Convert the font propertiess in css to the Xylog equivalents. If the CSS
1133        does not contain a particular font property, the default from self.book.defaultTextSytle
1134        is used. Assumes 1em = 10pt
1135        @return: dict, key, variant. The dict contains the Xlog equivalents. key indicates
1136          the font type (i.e. bold, bi, normal) and variant is None or 'small-caps'
1137        '''
1138        t = {}
1139        for key in ('fontwidth', 'fontsize', 'wordspace', 'fontfacename', 'fontweight', 'baselineskip'):
1140            t[key] = self.book.defaultTextStyle.attrs[key]
1141
1142        def font_weight(val):
1143            ans = 0
1144            m = re.search("([0-9]+)", val)
1145            if m:
1146                ans = int(m.group(1))
1147            elif val.find("bold") >= 0 or val.find("strong") >= 0:
1148                ans = 700
1149            return 'bold' if ans >= 700 else 'normal'
1150
1151        def font_style(val):
1152            ans = 'normal'
1153            if 'italic' in val or 'oblique' in val:
1154                ans = 'italic'
1155            return ans
1156
1157        def font_family(val):
1158            ans = 'serif'
1159            if max(val.find("courier"), val.find("mono"), val.find("fixed"), val.find("typewriter"))>=0:
1160                ans = 'mono'
1161            elif max(val.find("arial"), val.find("helvetica"), val.find("verdana"),
1162                 val.find("trebuchet"), val.find("sans")) >= 0:
1163                ans = 'sans'
1164            return ans
1165
1166        def font_variant(val):
1167            ans = None
1168            if 'small-caps' in val.lower():
1169                ans = 'small-caps'
1170            return ans
1171
1172        def font_key(family, style, weight):
1173            key = 'normal'
1174            if style == 'italic' and weight == 'normal':
1175                key = 'italic'
1176            elif style == 'normal' and weight == 'bold':
1177                key = 'bold'
1178            elif style == 'italic' and weight == 'bold':
1179                key = 'bi'
1180            return key
1181
1182        def font_size(val):
1183            '''
1184            Assumes 1em=100%=10pt
1185            '''
1186            normal = 100
1187            ans = self.unit_convert(val, pts=True, base_length='10pt')
1188
1189            if ans:
1190                if ans <= 0:
1191                    ans += normal
1192                    if ans == 0:  # Common case of using -1em to mean "smaller"
1193                        ans = int(font_size("smaller"))
1194                    if ans < 0:
1195                        ans = normal
1196            else:
1197                if ans == 0:
1198                    ans = int(font_size("smaller"))
1199                elif "smaller" in val:
1200                    ans = normal - 20
1201                elif "xx-small" in val:
1202                    ans = 40
1203                elif "x-small" in val:
1204                    ans = 60
1205                elif "small" in val:
1206                    ans = 80
1207                elif "medium" in val:
1208                    ans = 100
1209                elif "larger" in val:
1210                    ans = normal + 20
1211                elif "xx-large" in val:
1212                    ans = 180
1213                elif "x-large" in val:
1214                    ans = 140
1215                elif "large" in val:
1216                    ans = 120
1217            if ans is not None:
1218                ans += int(self.font_delta * 20)
1219                ans = str(ans)
1220            return ans
1221
1222        family, weight, style, variant = 'serif', 'normal', 'normal', None
1223        for key in css.keys():
1224            val = css[key].lower()
1225            if key == 'font':
1226                vals = val.split()
1227                for val in vals:
1228                    family = font_family(val)
1229                    if family != 'serif':
1230                        break
1231                for val in vals:
1232                    weight = font_weight(val)
1233                    if weight != 'normal':
1234                        break
1235                for val in vals:
1236                    style = font_style(val)
1237                    if style != 'normal':
1238                        break
1239                for val in vals:
1240                    sz = font_size(val)
1241                    if sz:
1242                        t['fontsize'] = sz
1243                        break
1244                for val in vals:
1245                    variant = font_variant(val)
1246                    if variant:
1247                        t['fontvariant'] = variant
1248                        break
1249            elif key in ['font-family', 'font-name']:
1250                family = font_family(val)
1251            elif key == "font-size":
1252                ans = font_size(val)
1253                if ans:
1254                    t['fontsize'] = ans
1255            elif key == 'font-weight':
1256                weight = font_weight(val)
1257            elif key == 'font-style':
1258                style = font_style(val)
1259            elif key == 'font-variant':
1260                variant = font_variant(val)
1261
1262        if variant:
1263            css['font-variant'] = variant
1264
1265        key = font_key(family, style, weight)
1266        if key in self.fonts[family]:
1267            t['fontfacename'] = self.fonts[family][key][1]
1268        else:
1269            t['fontfacename'] = self.fonts[family]['normal'][1]
1270        if key in ['bold', 'bi']:
1271            t['fontweight'] = 700
1272
1273        fs = int(t['fontsize'])
1274        if fs > 120:
1275            t['wordspace'] = fs // 4
1276        t['baselineskip'] = fs + 20
1277        return t, key, variant
1278
1279    def unit_convert(self, val, pts=False, base_length='10pt'):
1280        '''
1281        Tries to convert html units in C{val} to pixels.
1282        @param pts: If True return 10*pts instead of pixels.
1283        @return: The number of pixels (an int) if successful. Otherwise, returns None.
1284        '''
1285        dpi = self.profile.dpi
1286        result = None
1287        try:
1288            result = int(val)
1289        except ValueError:
1290            pass
1291        m = re.search(r"\s*(-*[0-9]*\.?[0-9]*)\s*(%|em|px|mm|cm|in|dpt|pt|pc)", val)
1292
1293        if m is not None and m.group(1):
1294            unit = float(m.group(1))
1295            if m.group(2) == '%':
1296                normal = self.unit_convert(base_length)
1297                result = (unit/100) * normal
1298            elif m.group(2) == 'px':
1299                result = unit
1300            elif m.group(2) == 'in':
1301                result = unit * dpi
1302            elif m.group(2) == 'pt':
1303                result = unit * dpi/72
1304            elif m.group(2) == 'dpt':
1305                result = unit * dpi/720
1306            elif m.group(2) == 'em':
1307                normal = self.unit_convert(base_length)
1308                result = unit * normal
1309            elif m.group(2) == 'pc':
1310                result = unit * (dpi/72) * 12
1311            elif m.group(2) == 'mm':
1312                result = unit * 0.04 * (dpi)
1313            elif m.group(2) == 'cm':
1314                result = unit * 0.4 * (dpi)
1315        if result is not None:
1316            if pts:
1317                result = int(round(result * (720/dpi)))
1318            else:
1319                result = int(round(result))
1320        return result
1321
1322    def text_properties(self, tag_css):
1323        indent = self.book.defaultTextStyle.attrs['parindent']
1324        if 'text-indent' in tag_css:
1325            bl = str(self.current_block.blockStyle.attrs['blockwidth'])+'px'
1326            if 'em' in tag_css['text-indent']:
1327                bl = '10pt'
1328            indent = self.unit_convert(str(tag_css['text-indent']), pts=True, base_length=bl)
1329            if not indent:
1330                indent = 0
1331            if indent > 0 and indent < 10 * self.minimum_indent:
1332                indent = int(10 * self.minimum_indent)
1333
1334        fp = self.font_properties(tag_css)[0]
1335        fp['parindent'] = indent
1336
1337        if 'line-height' in tag_css:
1338            bls, ls = int(self.book.defaultTextStyle.attrs['baselineskip']), \
1339                      int(self.book.defaultTextStyle.attrs['linespace'])
1340            try:  # See if line-height is a unitless number
1341                val = int(float(tag_css['line-height'].strip()) * (ls))
1342                fp['linespace'] = val
1343            except ValueError:
1344                val = self.unit_convert(tag_css['line-height'], pts=True, base_length='1pt')
1345            if val is not None:
1346                val -= bls
1347                if val >= 0:
1348                    fp['linespace'] = val
1349
1350        return fp
1351
1352    def process_block(self, tag, tag_css):
1353        ''' Ensure padding and text-indent properties are respected '''
1354        text_properties = self.text_properties(tag_css)
1355        block_properties = self.block_properties(tag_css)
1356        indent = (float(text_properties['parindent'])/10) * (self.profile.dpi/72)
1357        margin = float(block_properties['sidemargin'])
1358        # Since we're flattening the block structure, we need to ensure that text
1359        # doesn't go off the left edge of the screen
1360        if indent < 0 and margin + indent < 0:
1361            text_properties['parindent'] = int(-margin * (72/self.profile.dpi) * 10)
1362
1363        align = self.get_alignment(tag_css)
1364
1365        def fill_out_properties(props, default):
1366            for key in default.keys():
1367                if key not in props:
1368                    props[key] = default[key]
1369
1370        fill_out_properties(block_properties, self.book.defaultBlockStyle.attrs)
1371        fill_out_properties(text_properties, self.book.defaultTextStyle.attrs)
1372
1373        def properties_different(dict1, dict2):
1374            for key in dict1.keys():
1375                if dict1[key] != dict2[key]:
1376                    return True
1377            return False
1378
1379        if properties_different(self.current_block.blockStyle.attrs, block_properties) or \
1380           properties_different(self.current_block.textStyle.attrs, text_properties) or\
1381           align != self.current_block.textStyle.attrs['align']:
1382            ts = self.current_block.textStyle.copy()
1383            ts.attrs.update(text_properties)
1384            ts.attrs['align'] = align
1385            bs = self.current_block.blockStyle.copy()
1386            if not self.preserve_block_style:
1387                bs.attrs.update(block_properties)
1388            self.current_block.append_to(self.current_page)
1389            try:
1390                index = self.text_styles.index(ts)
1391                ts = self.text_styles[index]
1392            except ValueError:
1393                self.text_styles.append(ts)
1394            try:
1395                index = self.block_styles.index(bs)
1396                bs = self.block_styles[index]
1397            except ValueError:
1398                self.block_styles.append(bs)
1399            self.current_block = self.book.create_text_block(blockStyle=bs,
1400                                                             textStyle=ts)
1401            return True
1402        return False
1403
1404    def process_anchor(self, tag, tag_css, tag_pseudo_css):
1405        if not self.in_table:  # Anchors in tables are handled separately
1406            key = 'name' if tag.has_attr('name') else 'id'
1407            name = tag[key].replace('#', '')
1408            previous = self.current_block
1409            self.process_children(tag, tag_css, tag_pseudo_css)
1410            target = None
1411
1412            if self.current_block == previous:
1413                self.current_block.must_append = True
1414                target = self.current_block
1415            else:
1416                found = False
1417                for item in self.current_page.contents:
1418                    if item == previous:
1419                        found = True
1420                        continue
1421                    if found:
1422                        target = item
1423                        break
1424                if target and not isinstance(target, (TextBlock, ImageBlock)):
1425                    if isinstance(target, RuledLine):
1426                        target = self.book.create_text_block(textStyle=self.current_block.textStyle,
1427                                                     blockStyle=self.current_block.blockStyle)
1428                        target.Paragraph(' ')
1429                        self.current_page.append(target)
1430                    else:
1431                        target = BlockSpace()
1432                        self.current_page.append(target)
1433                if target is None:
1434                    if self.current_block.has_text():
1435                        target = self.current_block
1436                    else:
1437                        target = self.current_block
1438                        self.current_block.must_append = True
1439            self.targets[self.target_prefix+name] = target
1440        else:
1441            self.process_children(tag, tag_css, tag_pseudo_css)
1442
1443    def parse_tag(self, tag, parent_css):
1444        try:
1445            tagname = tag.name.lower()
1446        except AttributeError:
1447            if not isinstance(tag, HTMLConverter.IGNORED_TAGS):
1448                self.add_text(tag, parent_css, {})
1449            return
1450        tag_css, tag_pseudo_css = self.tag_css(tag, parent_css=parent_css)
1451        try:  # Skip element if its display attribute is set to none
1452            if tag_css['display'].lower() == 'none' or \
1453               tag_css['visibility'].lower() == 'hidden':
1454                return
1455        except KeyError:
1456            pass
1457        if not self.disable_chapter_detection and \
1458           (self.chapter_attr[0].match(tagname) and
1459            (self.chapter_attr[1].lower() == 'none' or
1460             (tag.has_attr(self.chapter_attr[1]) and
1461              self.chapter_attr[2].match(tag[self.chapter_attr[1]])))):
1462            self.log.debug('Detected chapter %s'%tagname)
1463            self.end_page()
1464            self.page_break_found = True
1465
1466            if self.options.add_chapters_to_toc:
1467                self.current_block.must_append = True
1468                self.extra_toc_entries.append((self.get_text(tag,
1469                    limit=1000), self.current_block))
1470
1471        end_page = self.process_page_breaks(tag, tagname, tag_css)
1472        try:
1473            if tagname in ["title", "script", "meta", 'del', 'frameset']:
1474                pass
1475            elif tagname == 'a' and self.link_levels >= 0:
1476                if tag.has_attr('href') and not self.link_exclude.match(tag['href']):
1477                    if urlparse(tag['href'])[0] not in ('', 'file'):
1478                        self.process_children(tag, tag_css, tag_pseudo_css)
1479                    else:
1480                        path = munge_paths(self.target_prefix, tag['href'])[0]
1481                        ext = os.path.splitext(path)[1]
1482                        if ext:
1483                            ext = ext[1:].lower()
1484                        if os.access(path, os.R_OK) and os.path.isfile(path):
1485                            if ext in ['png', 'jpg', 'bmp', 'jpeg']:
1486                                self.process_image(path, tag_css)
1487                            else:
1488                                text = self.get_text(tag, limit=1000)
1489                                if not text.strip():
1490                                    text = "Link"
1491                                self.add_text(text, tag_css, {}, force_span_use=True)
1492                                self.links.append(self.create_link(self.current_para.contents, tag))
1493                                if tag.has_attr('id') or tag.has_attr('name'):
1494                                    key = 'name' if tag.has_attr('name') else 'id'
1495                                    self.targets[self.target_prefix+tag[key]] = self.current_block
1496                                    self.current_block.must_append = True
1497                        else:
1498                            self.log.debug('Could not follow link to '+tag['href'])
1499                            self.process_children(tag, tag_css, tag_pseudo_css)
1500                elif tag.has_attr('name') or tag.has_attr('id'):
1501                    self.process_anchor(tag, tag_css, tag_pseudo_css)
1502                else:
1503                    self.process_children(tag, tag_css, tag_pseudo_css)
1504            elif tagname == 'img':
1505                if tag.has_attr('src'):
1506                    path = munge_paths(self.target_prefix, tag['src'])[0]
1507                    if not os.path.exists(path):
1508                        path = path.replace('&', '%26')  # convertlit replaces & with %26
1509                    if os.access(path, os.R_OK) and os.path.isfile(path):
1510                        width, height = None, None
1511                        try:
1512                            width = int(tag['width'])
1513                            height = int(tag['height'])
1514                        except:
1515                            pass
1516                        dropcaps = tag.get('class') in ('libprs500_dropcaps', ['libprs500_dropcaps'])
1517                        self.process_image(path, tag_css, width, height,
1518                                           dropcaps=dropcaps, rescale=True)
1519                    elif not urlparse(tag['src'])[0]:
1520                        self.log.warn('Could not find image: '+tag['src'])
1521                else:
1522                    self.log.debug("Failed to process: %s"%str(tag))
1523            elif tagname in ['style', 'link']:
1524                ncss, npcss = {}, {}
1525                if tagname == 'style':
1526                    text = ''.join([str(i) for i in tag.findAll(text=True)])
1527                    css, pcss = self.parse_css(text)
1528                    ncss.update(css)
1529                    npcss.update(pcss)
1530                elif (tag.has_attr('type') and tag['type'] in ("text/css", "text/x-oeb1-css") and tag.has_attr('href')):
1531                    path = munge_paths(self.target_prefix, tag['href'])[0]
1532                    try:
1533                        with open(path, 'rb') as f:
1534                            src = f.read().decode('utf-8', 'replace')
1535                        match = self.PAGE_BREAK_PAT.search(src)
1536                        if match and not re.match('avoid', match.group(1), re.IGNORECASE):
1537                            self.page_break_found = True
1538                        ncss, npcss = self.parse_css(src)
1539                    except OSError:
1540                        self.log.warn('Could not read stylesheet: '+tag['href'])
1541                if ncss:
1542                    update_css(ncss, self.css)
1543                    self.css.update(self.override_css)
1544                if npcss:
1545                    update_css(npcss, self.pseudo_css)
1546                    self.pseudo_css.update(self.override_pcss)
1547            elif tagname == 'pre':
1548                self.end_current_para()
1549                self.end_current_block()
1550                self.current_block = self.book.create_text_block()
1551                ts = self.current_block.textStyle.copy()
1552                self.current_block.textStyle = ts
1553                self.current_block.textStyle.attrs['parindent'] = '0'
1554
1555                if tag.contents:
1556                    c = tag.contents[0]
1557                    if isinstance(c, NavigableString):
1558                        c = str(c).replace('\r\n', '\n').replace('\r', '\n')
1559                        if c.startswith('\n'):
1560                            c = c[1:]
1561                            tag.contents[0] = NavigableString(c)
1562                            tag.contents[0].setup(tag)
1563                self.process_children(tag, tag_css, tag_pseudo_css)
1564                self.end_current_block()
1565            elif tagname in ['ul', 'ol', 'dl']:
1566                self.list_level += 1
1567                if tagname == 'ol':
1568                    old_counter = self.list_counter
1569                    self.list_counter = 1
1570                    try:
1571                        self.list_counter = int(tag['start'])
1572                    except:
1573                        pass
1574                prev_bs = self.current_block.blockStyle
1575                self.end_current_block()
1576                attrs = self.current_block.blockStyle.attrs
1577                attrs = attrs.copy()
1578                attrs['sidemargin'] = self.list_indent*self.list_level
1579                bs = self.book.create_block_style(**attrs)
1580                self.current_block = self.book.create_text_block(
1581                                            blockStyle=bs,
1582                                            textStyle=self.unindented_style)
1583                self.process_children(tag, tag_css, tag_pseudo_css)
1584                self.end_current_block()
1585                self.current_block.blockStyle = prev_bs
1586                self.list_level -= 1
1587                if tagname == 'ol':
1588                    self.list_counter = old_counter
1589            elif tagname in ['li', 'dt', 'dd']:
1590                margin = self.list_indent*self.list_level
1591                if tagname == 'dd':
1592                    margin += 80
1593                if int(self.current_block.blockStyle.attrs['sidemargin']) != margin:
1594                    self.end_current_block()
1595                    attrs = self.current_block.blockStyle.attrs
1596                    attrs = attrs.copy()
1597                    attrs['sidemargin'] = margin
1598                    attrs['blockwidth'] = int(attrs['blockwidth']) + margin
1599                    bs = self.book.create_block_style(**attrs)
1600                    self.current_block = self.book.create_text_block(
1601                                            blockStyle=bs,
1602                                            textStyle=self.unindented_style)
1603
1604                if self.current_para.has_text():
1605                    self.line_break()
1606                    self.current_block.append(self.current_para)
1607                self.current_para = Paragraph()
1608                self.previous_text = '\n'
1609                if tagname == 'li':
1610                    in_ol, parent = True, tag.parent
1611                    while parent:
1612                        if parent.name and parent.name.lower() in ['ul', 'ol']:
1613                            in_ol = parent.name.lower() == 'ol'
1614                            break
1615                        parent = parent.parent
1616                    prepend = str(self.list_counter)+'. ' if in_ol else '\u2022' + ' '
1617                    self.current_para.append(Span(prepend))
1618                    self.process_children(tag, tag_css, tag_pseudo_css)
1619                    if in_ol:
1620                        self.list_counter += 1
1621                else:
1622                    self.process_children(tag, tag_css, tag_pseudo_css)
1623            elif tagname == 'blockquote':
1624                self.current_para.append_to(self.current_block)
1625                self.current_block.append_to(self.current_page)
1626                pb = self.current_block
1627                self.current_para = Paragraph()
1628                ts = self.book.create_text_style()
1629                ts.attrs['parindent'] = 0
1630                try:
1631                    index = self.text_styles.index(ts)
1632                    ts = self.text_styles[index]
1633                except ValueError:
1634                    self.text_styles.append(ts)
1635                bs = self.book.create_block_style()
1636                bs.attrs['sidemargin'], bs.attrs['topskip'], bs.attrs['footskip'] = \
1637                60, 20, 20
1638                try:
1639                    index = self.block_styles.index(bs)
1640                    bs = self.block_styles[index]
1641                except ValueError:
1642                    self.block_styles.append(bs)
1643                self.current_block = self.book.create_text_block(
1644                                        blockStyle=bs, textStyle=ts)
1645                self.previous_text = '\n'
1646                self.preserve_block_style = True
1647                self.process_children(tag, tag_css, tag_pseudo_css)
1648                self.preserve_block_style = False
1649                self.current_para.append_to(self.current_block)
1650                self.current_block.append_to(self.current_page)
1651                self.current_para = Paragraph()
1652                self.current_block = self.book.create_text_block(textStyle=pb.textStyle,
1653                                                                 blockStyle=pb.blockStyle)
1654            elif tagname in ['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
1655                new_block = self.process_block(tag, tag_css)
1656
1657                if (self.anchor_ids and tag.has_attr('id')) or (self.book_designer and tag.get('class') in ('title', ['title'])):
1658                    if not tag.has_attr('id'):
1659                        tag['id'] = __appname__+'_id_'+str(self.id_counter)
1660                        self.id_counter += 1
1661
1662                    tkey = self.target_prefix+tag['id']
1663                    if not new_block:
1664                        self.end_current_block()
1665                    self.current_block.must_append = True
1666                    self.targets[tkey] = self.current_block
1667                    if (self.book_designer and tag.get('class') in ('title', ['title'])):
1668                        self.extra_toc_entries.append((self.get_text(tag, 100), self.current_block))
1669
1670                src = self.get_text(tag, limit=1000)
1671
1672                if not self.disable_chapter_detection and tagname.startswith('h'):
1673                    if self.chapter_regex.search(src):
1674                        self.log.debug('Detected chapter %s'%src)
1675                        self.end_page()
1676                        self.page_break_found = True
1677
1678                        if self.options.add_chapters_to_toc:
1679                            self.current_block.must_append = True
1680                            self.extra_toc_entries.append((self.get_text(tag,
1681                                limit=1000), self.current_block))
1682
1683                if self.current_para.has_text():
1684                    self.current_para.append_to(self.current_block)
1685                self.current_para = Paragraph()
1686
1687                self.previous_text = '\n'
1688
1689                if not tag.contents:
1690                    self.current_block.append(CR())
1691                    return
1692
1693                if self.current_block.contents:
1694                    self.current_block.append(CR())
1695
1696                self.process_children(tag, tag_css, tag_pseudo_css)
1697
1698                if self.current_para.contents :
1699                    self.current_block.append(self.current_para)
1700                self.current_para = Paragraph()
1701                if tagname.startswith('h') or self.blank_after_para:
1702                    self.current_block.append(CR())
1703            elif tagname in ['b', 'strong', 'i', 'em', 'span', 'tt', 'big', 'code', 'cite', 'sup', 'sub']:
1704                self.process_children(tag, tag_css, tag_pseudo_css)
1705            elif tagname == 'font':
1706                if tag.has_attr('face'):
1707                    tag_css['font-family'] = tag['face']
1708                if tag.has_attr('color'):
1709                    tag_css['color'] = tag['color']
1710                self.process_children(tag, tag_css, tag_pseudo_css)
1711            elif tagname in ['br']:
1712                self.line_break()
1713                self.previous_text = '\n'
1714            elif tagname in ['hr', 'tr']:  # tr needed for nested tables
1715                self.end_current_block()
1716                if tagname == 'hr' and not tag_css.get('width', '').strip().startswith('0'):
1717                    self.current_page.RuledLine(linelength=int(self.current_page.pageStyle.attrs['textwidth']))
1718                self.previous_text = '\n'
1719                self.process_children(tag, tag_css, tag_pseudo_css)
1720            elif tagname == 'td':  # Needed for nested tables
1721                if not self.in_table:
1722                    self.current_para.append(' ')
1723                    self.previous_text = ' '
1724                self.process_children(tag, tag_css, tag_pseudo_css)
1725            elif tagname == 'table' and not self.ignore_tables and not self.in_table:
1726                tag_css = self.tag_css(tag)[0]  # Table should not inherit CSS
1727                try:
1728                    self.process_table(tag, tag_css)
1729                except Exception as err:
1730                    self.log.warning(_('An error occurred while processing a table: %s. Ignoring table markup.')%repr(err))
1731                    self.log.exception('')
1732                    self.log.debug(_('Bad table:\n%s')%str(tag)[:300])
1733                    self.in_table = False
1734                    self.process_children(tag, tag_css, tag_pseudo_css)
1735                finally:
1736                    if self.minimize_memory_usage:
1737                        tag.extract()
1738            else:
1739                self.process_children(tag, tag_css, tag_pseudo_css)
1740        finally:
1741            if end_page:
1742                self.end_page()
1743
1744    def process_table(self, tag, tag_css):
1745        self.end_current_block()
1746        self.current_block = self.book.create_text_block()
1747        rowpad = 10
1748        table = Table(self, tag, tag_css, rowpad=rowpad, colpad=10)
1749        canvases = []
1750        ps = self.current_page.pageStyle.attrs
1751        for block, xpos, ypos, delta, targets in table.blocks(int(ps['textwidth']), int(ps['textheight'])):
1752            if not block:
1753                if ypos > int(ps['textheight']):
1754                    raise Exception(_('Table has cell that is too large'))
1755                canvases.append(Canvas(int(self.current_page.pageStyle.attrs['textwidth']), ypos+rowpad,
1756                        blockrule='block-fixed'))
1757                for name in targets:
1758                    self.targets[self.target_prefix+name] = canvases[-1]
1759            else:
1760                if xpos > 65535:
1761                    xpos = 65535
1762                canvases[-1].put_object(block, xpos + int(delta/2), ypos)
1763
1764        for canvas in canvases:
1765            self.current_page.append(canvas)
1766        self.end_current_block()
1767
1768    def remove_unused_target_blocks(self):
1769        for block in self.unused_target_blocks:
1770            block.parent.contents.remove(block)
1771            block.parent = None
1772
1773    def writeto(self, path, lrs=False):
1774        self.remove_unused_target_blocks()
1775        self.book.renderLrs(path) if lrs else self.book.renderLrf(path)
1776
1777    def cleanup(self):
1778        for _file in chain(itervalues(self.scaled_images), itervalues(self.rotated_images)):
1779            _file.__del__()
1780
1781
def process_file(path, options, logger):
    """
    Convert the HTML file at ``path`` and write the result to disk.

    Metadata from an OPF file is merged into ``options`` first (see
    ``try_opf``). ``options`` is modified in place: the cover is replaced by a
    temporary JPEG, missing metadata gets defaults and the various pattern
    options are replaced by compiled regular expressions.

    :param path: Path to the HTML file to convert.
    :param options: The conversion options object.
    :param logger: Logger used to report problems.
    :return: Absolute path of the file that was written. When
             ``options.output`` is empty, it is placed in the current working
             directory and named after the input file with a ``.lrs`` or
             ``.lrf`` suffix.
    :raises ConversionError: If a cover was specified but cannot be read.
    :raises ValueError: If ``options.chapter_attr`` has fewer than three comma
                        separated fields.
    """
    path = os.path.abspath(path)
    default_title = force_unicode(os.path.splitext(os.path.basename(path))[0], filesystem_encoding)
    dirpath = os.path.dirname(path)

    tpath = ''
    try_opf(path, options, logger)
    if getattr(options, 'cover', None):
        options.cover = os.path.expanduser(options.cover)
        # Relative cover paths are resolved against the directory of the input
        if not os.path.isabs(options.cover):
            options.cover = os.path.join(dirpath, options.cover)
        if os.access(options.cover, os.R_OK):
            th = Device.THUMBNAIL_HEIGHT
            # The context manager makes sure the image file is closed again
            with PILImage.open(options.cover) as im:
                pwidth, pheight = options.profile.screen_width, \
                                  options.profile.screen_height - options.profile.fudge
                width, height = im.size
                if width < pwidth:
                    corrf = pwidth/width
                    width, height = pwidth, int(corrf*height)

                scaled, width, height = fit_image(width, height, pwidth, pheight)
                try:
                    cim = im.resize((width, height), PILImage.BICUBIC).convert('RGB') if \
                          scaled else im
                    cf = PersistentTemporaryFile(prefix=__appname__+"_", suffix=".jpg")
                    cf.close()
                    cim.convert('RGB').save(cf.name)
                    options.cover = cf.name

                    # LANCZOS is the filter formerly known as ANTIALIAS, a name
                    # that newer versions of Pillow no longer provide
                    tim = im.resize((int(0.75*th), th), PILImage.LANCZOS).convert('RGB')
                    tf = PersistentTemporaryFile(prefix=__appname__+'_', suffix=".jpg")
                    tf.close()
                    tim.save(tf.name)
                    tpath = tf.name
                except OSError as err:  # PIL sometimes fails, for example on interlaced PNG files
                    logger.warn(_('Could not read cover image: %s') % err)
                    options.cover = None
        else:
            raise ConversionError(_('Cannot read from: %s')% (options.cover,))

    if not options.title:
        options.title = default_title

    # Run metadata values that are not already text through BeautifulSoup to
    # turn them into text
    for prop in ('author', 'author_sort', 'title', 'title_sort', 'publisher', 'freetext'):
        val = getattr(options, prop, None)
        if val and not isinstance(val, str):
            soup = BeautifulSoup(val)
            setattr(options, prop, str(soup))

    title = (options.title, options.title_sort)
    author = (options.author, options.author_sort)

    args = dict(font_delta=options.font_delta, title=title,
                author=author, sourceencoding='utf8',
                freetext=options.freetext, category=options.category,
                publisher=options.publisher,
                booksetting=BookSetting(dpi=10*options.profile.dpi,
                                        screenheight=options.profile.screen_height,
                                        screenwidth=options.profile.screen_width))
    if tpath:
        args['thumbnail'] = tpath
    header = None
    if options.header:
        header = Paragraph()
        fheader = options.headerformat
        if not options.title:
            options.title = _('Unknown')
        if not options.author:
            options.author = _('Unknown')
        if not fheader:
            fheader = "%t by %a"
        # Expand %t/%a and unescape %%t/%%a in a single pass. A callable is
        # used so that the title and author are inserted literally: backslashes
        # in them are not interpreted as escapes by re.sub and placeholders
        # occurring inside them are not expanded again.
        fields = {'t': options.title, 'a': options.author}

        def expand(match):
            escaped, field = match.groups()
            if escaped:
                return '%' + escaped
            return fields[field]

        fheader = re.sub(r'%%([at])|%([at])', expand, fheader)
        header.append(fheader + "  ")
    book, fonts = Book(options, logger, header=header, **args)
    # '$' is the fallback pattern for the options that were not specified
    le = re.compile(options.link_exclude) if options.link_exclude else \
         re.compile('$')
    pb = re.compile(options.page_break, re.IGNORECASE) if options.page_break else \
         re.compile('$')
    fpb = re.compile(options.force_page_break, re.IGNORECASE) if options.force_page_break else \
         re.compile('$')
    # chapter_attr is "tag name regex,attribute name,attribute value regex"
    cq = options.chapter_attr.split(',')
    if len(cq) < 3:
        raise ValueError('The --chapter-attr setting must have 2 commas.')
    options.chapter_attr = [re.compile(cq[0], re.IGNORECASE), cq[1],
                            re.compile(cq[2], re.IGNORECASE)]
    options.force_page_break = fpb
    options.link_exclude = le
    options.page_break = pb
    if not isinstance(options.chapter_regex, str):
        options.chapter_regex = options.chapter_regex.decode(preferred_encoding)
    options.chapter_regex = re.compile(options.chapter_regex, re.IGNORECASE)
    # Unlike chapter_attr, a malformed force_page_break_attr is silently
    # replaced by a default
    fpba = options.force_page_break_attr.split(',')
    if len(fpba) != 3:
        fpba = ['$', '', '$']
    options.force_page_break_attr = [re.compile(fpba[0], re.IGNORECASE), fpba[1],
                                     re.compile(fpba[2], re.IGNORECASE)]
    if not hasattr(options, 'anchor_ids'):
        options.anchor_ids = True
    files = options.spine if (options.use_spine and hasattr(options, 'spine')) else [path]
    conv = HTMLConverter(book, fonts, options, logger, files)
    if options.use_spine and hasattr(options, 'toc') and options.toc is not None:
        conv.create_toc(options.toc)
    oname = options.output
    if not oname:
        suffix = '.lrs' if options.lrs else '.lrf'
        name = os.path.splitext(os.path.basename(path))[0] + suffix
        oname = os.path.join(os.getcwd(), name)
    oname = os.path.abspath(os.path.expanduser(oname))
    conv.writeto(oname, lrs=options.lrs)
    conv.cleanup()
    return oname
1897
1898
def try_opf(path, options, logger):
    """
    Fill in missing conversion options from an OPF file, if there is one.

    The OPF file is taken from ``options.opf`` when that attribute exists,
    otherwise the first file with an ``opf`` extension found in the directory
    of ``path`` is used. Nothing happens when no OPF file is available.
    Errors raised while transferring the metadata are logged, not propagated.

    :param path: Path to the file being converted.
    :param options: The conversion options object; updated in place with the
                    title, author, publisher, cover, spine and table of
                    contents from the OPF file where applicable.
    :param logger: Logger used to report a failure to process the OPF file.
    """
    if hasattr(options, 'opf'):
        opf = options.opf
    else:
        # Escape the directory so that glob metacharacters in its name
        # (for example "Book [2008]") are matched literally
        pattern = os.path.join(glob.escape(os.path.dirname(path)), '*')
        files = glob.glob(pattern)
        opf = None
        for f in files:
            ext = f.rpartition('.')[-1].lower()
            if ext == 'opf':
                opf = f
                break
    if opf is None:
        return

    dirpath = os.path.dirname(os.path.abspath(opf))
    from calibre.ebooks.metadata.opf2 import OPF as OPF2
    with open(opf, 'rb') as f:
        opf = OPF2(f, dirpath)
    try:
        title = opf.title
        if title and not getattr(options, 'title', None):
            options.title = title
        if getattr(options, 'author', 'Unknown') == 'Unknown':
            if opf.authors:
                options.author = ', '.join(opf.authors)
            if opf.author_sort:
                options.author_sort = opf.author_sort
        if options.publisher == 'Unknown':
            publisher = opf.publisher
            if publisher:
                options.publisher = publisher
        if not getattr(options, 'cover', None) or options.use_metadata_cover:
            orig_cover = getattr(options, 'cover', None)
            options.cover = None
            cover = opf.cover
            if cover:
                cover = cover.replace('/', os.sep)
                if not os.path.isabs(cover):
                    cover = os.path.join(dirpath, cover)
                if os.access(cover, os.R_OK):
                    try:
                        # Only checks that PIL can open the file, so the
                        # handle is released again right away
                        PILImage.open(cover).close()
                        options.cover = cover
                    except Exception:
                        # Best effort: an unusable cover is simply not used
                        pass
            # Fall back to the originally specified cover
            if not getattr(options, 'cover', None) and orig_cover is not None:
                options.cover = orig_cover
        if getattr(opf, 'spine', False):
            options.spine = [i.path for i in opf.spine if i.path]
        if not getattr(options, 'toc', None):
            options.toc   = opf.toc
    except Exception:
        logger.exception(_('Failed to process OPF file'))
1952