1#!/usr/local/bin/python3.8
2# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
3
4
5__license__   = 'GPL v3'
6__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
7__docformat__ = 'restructuredtext en'
8
9import sys, os, numbers
10from itertools import count
11
12from lxml import etree
13
14from calibre.utils.xml_parse import safe_xml_fromstring
15
16
17class Font:
18
19    def __init__(self, spec):
20        self.id = spec.get('id')
21        self.size = float(spec.get('size'))
22        self.color = spec.get('color')
23        self.family = spec.get('family')
24
25
26class Element:
27
28    def __init__(self):
29        self.starts_block = None
30        self.block_style = None
31
32    def __eq__(self, other):
33        return self.id == other.id
34
35    def __hash__(self):
36        return hash(self.id)
37
38
39class Image(Element):
40
41    def __init__(self, img, opts, log, idc):
42        Element.__init__(self)
43        self.opts, self.log = opts, log
44        self.id = next(idc)
45        self.top, self.left, self.width, self.height, self.iwidth, self.iheight = \
46          map(float, map(img.get, ('top', 'left', 'rwidth', 'rheight', 'iwidth',
47              'iheight')))
48        self.src = img.get('src')
49        self.bottom = self.top + self.height
50        self.right = self.left + self.width
51
52    def to_html(self):
53        return '<img src="%s" width="%dpx" height="%dpx"/>' % \
54                (self.src, int(self.width), int(self.height))
55
56    def dump(self, f):
57        f.write(self.to_html())
58        f.write('\n')
59
60
61class Text(Element):
62
63    def __init__(self, text, font_map, opts, log, idc):
64        Element.__init__(self)
65        self.id = next(idc)
66        self.opts, self.log = opts, log
67        self.font_map = font_map
68        self.top, self.left, self.width, self.height = map(float, map(text.get,
69            ('top', 'left', 'width', 'height')))
70        self.bottom  = self.top + self.height
71        self.right = self.left + self.width
72        self.font = self.font_map[text.get('font')]
73        self.font_size = self.font.size
74        self.color = self.font.color
75        self.font_family = self.font.family
76
77        text.tail = ''
78        self.text_as_string = etree.tostring(text, method='text',
79                encoding='unicode')
80        self.raw = text.text if text.text else ''
81        for x in text.iterchildren():
82            self.raw += etree.tostring(x, method='xml', encoding='unicode')
83        self.average_character_width = self.width/len(self.text_as_string)
84
85    def coalesce(self, other, page_number):
86        if self.opts.verbose > 2:
87            self.log.debug('Coalescing %r with %r on page %d'%(self.text_as_string,
88                other.text_as_string, page_number))
89        self.top = min(self.top, other.top)
90        self.right = other.right
91        self.width = self.right - self.left
92        self.bottom = max(self.bottom, other.bottom)
93        self.height = self.bottom - self.top
94        self.font_size = max(self.font_size, other.font_size)
95        self.font = other.font if self.font_size == other.font_size else other.font
96        self.text_as_string += other.text_as_string
97        self.raw += other.raw
98        self.average_character_width = (self.average_character_width +
99                other.average_character_width)/2.0
100
101    def to_html(self):
102        return self.raw
103
104    def dump(self, f):
105        f.write(self.to_html().encode('utf-8'))
106        f.write('\n')
107
108
109class FontSizeStats(dict):
110
111    def __init__(self, stats):
112        total = float(sum(stats.values()))
113        self.most_common_size, self.chars_at_most_common_size = -1, 0
114
115        for sz, chars in stats.items():
116            if chars >= self.chars_at_most_common_size:
117                self.most_common_size, self.chars_at_most_common_size = sz, chars
118            self[sz] = chars/total
119
120
121class Interval:
122
123    def __init__(self, left, right):
124        self.left, self.right = left, right
125        self.width = right - left
126
127    def intersection(self, other):
128        left = max(self.left, other.left)
129        right = min(self.right, other.right)
130        return Interval(left, right)
131
132    def centered_in(self, parent):
133        left = abs(self.left - parent.left)
134        right = abs(self.right - parent.right)
135        return abs(left-right) < 3
136
137    def __nonzero__(self):
138        return self.width > 0
139
140    def __eq__(self, other):
141        return self.left == other.left and self.right == other.right
142
143    def __hash__(self):
144        return hash('(%f,%f)'%self.left, self.right)
145
146
147class Column:
148
149    # A column contains an element is the element bulges out to
150    # the left or the right by at most HFUZZ*col width.
151    HFUZZ = 0.2
152
153    def __init__(self):
154        self.left = self.right = self.top = self.bottom = 0
155        self.width = self.height = 0
156        self.elements = []
157        self.average_line_separation = 0
158
159    def add(self, elem):
160        if elem in self.elements:
161            return
162        self.elements.append(elem)
163        self._post_add()
164
165    def prepend(self, elem):
166        if elem in self.elements:
167            return
168        self.elements.insert(0, elem)
169        self._post_add()
170
171    def _post_add(self):
172        self.elements.sort(key=lambda x: x.bottom)
173        self.top = self.elements[0].top
174        self.bottom = self.elements[-1].bottom
175        self.left, self.right = sys.maxsize, 0
176        for x in self:
177            self.left = min(self.left, x.left)
178            self.right = max(self.right, x.right)
179        self.width, self.height = self.right-self.left, self.bottom-self.top
180
181    def __iter__(self):
182        yield from self.elements
183
184    def __len__(self):
185        return len(self.elements)
186
187    def contains(self, elem):
188        return elem.left > self.left - self.HFUZZ*self.width and \
189               elem.right < self.right + self.HFUZZ*self.width
190
191    def collect_stats(self):
192        if len(self.elements) > 1:
193            gaps = [self.elements[i+1].top - self.elements[i].bottom for i in
194                    range(0, len(self.elements)-1)]
195            self.average_line_separation = sum(gaps)/len(gaps)
196        for i, elem in enumerate(self.elements):
197            left_margin = elem.left - self.left
198            elem.indent_fraction = left_margin/self.width
199            elem.width_fraction = elem.width/self.width
200            if i == 0:
201                elem.top_gap_ratio = None
202            else:
203                elem.top_gap_ratio = (self.elements[i-1].bottom -
204                        elem.top)/self.average_line_separation
205
206    def previous_element(self, idx):
207        if idx == 0:
208            return None
209        return self.elements[idx-1]
210
211    def dump(self, f, num):
212        f.write('******** Column %d\n\n'%num)
213        for elem in self.elements:
214            elem.dump(f)
215
216
217class Box(list):
218
219    def __init__(self, type='p'):
220        self.tag = type
221
222    def to_html(self):
223        ans = ['<%s>'%self.tag]
224        for elem in self:
225            if isinstance(elem, numbers.Integral):
226                ans.append('<a name="page_%d"/>'%elem)
227            else:
228                ans.append(elem.to_html()+' ')
229        ans.append('</%s>'%self.tag)
230        return ans
231
232
233class ImageBox(Box):
234
235    def __init__(self, img):
236        Box.__init__(self)
237        self.img = img
238
239    def to_html(self):
240        ans = ['<div style="text-align:center">']
241        ans.append(self.img.to_html())
242        if len(self) > 0:
243            ans.append('<br/>')
244            for elem in self:
245                if isinstance(elem, numbers.Integral):
246                    ans.append('<a name="page_%d"/>'%elem)
247                else:
248                    ans.append(elem.to_html()+' ')
249        ans.append('</div>')
250        return ans
251
252
253class Region:
254
255    def __init__(self, opts, log):
256        self.opts, self.log = opts, log
257        self.columns = []
258        self.top = self.bottom = self.left = self.right = self.width = self.height = 0
259
260    def add(self, columns):
261        if not self.columns:
262            for x in sorted(columns, key=lambda x: x.left):
263                self.columns.append(x)
264        else:
265            for i in range(len(columns)):
266                for elem in columns[i]:
267                    self.columns[i].add(elem)
268
269    def contains(self, columns):
270        # TODO: handle unbalanced columns
271        if not self.columns:
272            return True
273        if len(columns) != len(self.columns):
274            return False
275        for i in range(len(columns)):
276            c1, c2 = self.columns[i], columns[i]
277            x1 = Interval(c1.left, c1.right)
278            x2 = Interval(c2.left, c2.right)
279            intersection = x1.intersection(x2)
280            base = min(x1.width, x2.width)
281            if intersection.width/base < 0.6:
282                return False
283        return True
284
285    @property
286    def is_empty(self):
287        return len(self.columns) == 0
288
289    @property
290    def line_count(self):
291        max_lines = 0
292        for c in self.columns:
293            max_lines = max(max_lines, len(c))
294        return max_lines
295
296    @property
297    def is_small(self):
298        return self.line_count < 3
299
300    def absorb(self, singleton):
301
302        def most_suitable_column(elem):
303            mc, mw = None, 0
304            for c in self.columns:
305                i = Interval(c.left, c.right)
306                e = Interval(elem.left, elem.right)
307                w = i.intersection(e).width
308                if w > mw:
309                    mc, mw = c, w
310            if mc is None:
311                self.log.warn('No suitable column for singleton',
312                        elem.to_html())
313                mc = self.columns[0]
314            return mc
315
316        for c in singleton.columns:
317            for elem in c:
318                col = most_suitable_column(elem)
319                if self.opts.verbose > 3:
320                    idx = self.columns.index(col)
321                    self.log.debug('Absorbing singleton %s into column'%elem.to_html(),
322                            idx)
323                col.add(elem)
324
325    def collect_stats(self):
326        for column in self.columns:
327            column.collect_stats()
328        self.average_line_separation = sum(x.average_line_separation for x in
329            self.columns)/float(len(self.columns))
330
331    def __iter__(self):
332        yield from self.columns
333
334    def absorb_regions(self, regions, at):
335        for region in regions:
336            self.absorb_region(region, at)
337
338    def absorb_region(self, region, at):
339        if len(region.columns) <= len(self.columns):
340            for i in range(len(region.columns)):
341                src, dest = region.columns[i], self.columns[i]
342                if at != 'bottom':
343                    src = reversed(list(iter(src)))
344                for elem in src:
345                    func = dest.add if at == 'bottom' else dest.prepend
346                    func(elem)
347
348        else:
349            col_map = {}
350            for i, col in enumerate(region.columns):
351                max_overlap, max_overlap_index = 0, 0
352                for j, dcol in enumerate(self.columns):
353                    sint = Interval(col.left, col.right)
354                    dint = Interval(dcol.left, dcol.right)
355                    width = sint.intersection(dint).width
356                    if width > max_overlap:
357                        max_overlap = width
358                        max_overlap_index = j
359                col_map[i] = max_overlap_index
360            lines = max(map(len, region.columns))
361            if at == 'bottom':
362                lines = range(lines)
363            else:
364                lines = range(lines-1, -1, -1)
365            for i in lines:
366                for j, src in enumerate(region.columns):
367                    dest = self.columns[col_map[j]]
368                    if i < len(src):
369                        func = dest.add if at == 'bottom' else dest.prepend
370                        func(src.elements[i])
371
372    def dump(self, f):
373        f.write('############################################################\n')
374        f.write('########## Region (%d columns) ###############\n'%len(self.columns))
375        f.write('############################################################\n\n')
376        for i, col in enumerate(self.columns):
377            col.dump(f, i)
378
379    def linearize(self):
380        self.elements = []
381        for x in self.columns:
382            self.elements.extend(x)
383        self.boxes = [Box()]
384        for i, elem in enumerate(self.elements):
385            if isinstance(elem, Image):
386                self.boxes.append(ImageBox(elem))
387                img = Interval(elem.left, elem.right)
388                for j in range(i+1, len(self.elements)):
389                    t = self.elements[j]
390                    if not isinstance(t, Text):
391                        break
392                    ti = Interval(t.left, t.right)
393                    if not ti.centered_in(img):
394                        break
395                    self.boxes[-1].append(t)
396                self.boxes.append(Box())
397            else:
398                is_indented = False
399                if i+1 < len(self.elements):
400                    indent_diff = elem.indent_fraction - \
401                        self.elements[i+1].indent_fraction
402                    if indent_diff > 0.05:
403                        is_indented = True
404                if elem.top_gap_ratio > 1.2 or is_indented:
405                    self.boxes.append(Box())
406                self.boxes[-1].append(elem)
407
408
409class Page:
410
411    # Fraction of a character width that two strings have to be apart,
412    # for them to be considered part of the same text fragment
413    COALESCE_FACTOR = 0.5
414
415    # Fraction of text height that two strings' bottoms can differ by
416    # for them to be considered to be part of the same text fragment
417    LINE_FACTOR = 0.4
418
419    # Multiplies the average line height when determining row height
420    # of a particular element to detect columns.
421    YFUZZ = 1.5
422
423    def __init__(self, page, font_map, opts, log, idc):
424        self.opts, self.log = opts, log
425        self.font_map = font_map
426        self.number = int(page.get('number'))
427        self.width, self.height = map(float, map(page.get,
428            ('width', 'height')))
429        self.id = 'page%d'%self.number
430
431        self.texts = []
432        self.left_margin, self.right_margin = self.width, 0
433
434        for text in page.xpath('descendant::text'):
435            self.texts.append(Text(text, self.font_map, self.opts, self.log, idc))
436            text = self.texts[-1]
437            self.left_margin = min(text.left, self.left_margin)
438            self.right_margin = max(text.right, self.right_margin)
439
440        self.textwidth = self.right_margin - self.left_margin
441
442        self.font_size_stats = {}
443        self.average_text_height = 0
444        for t in self.texts:
445            if t.font_size not in self.font_size_stats:
446                self.font_size_stats[t.font_size] = 0
447            self.font_size_stats[t.font_size] += len(t.text_as_string)
448            self.average_text_height += t.height
449        if len(self.texts):
450            self.average_text_height /= len(self.texts)
451
452        self.font_size_stats = FontSizeStats(self.font_size_stats)
453
454        self.coalesce_fragments()
455
456        self.elements = list(self.texts)
457        for img in page.xpath('descendant::img'):
458            self.elements.append(Image(img, self.opts, self.log, idc))
459        self.elements.sort(key=lambda x: x.top)
460
461    def coalesce_fragments(self):
462
463        def find_match(frag):
464            for t in self.texts:
465                hdelta = t.left - frag.right
466                hoverlap = self.COALESCE_FACTOR * frag.average_character_width
467                if t is not frag and hdelta > -hoverlap and \
468                    hdelta < hoverlap and \
469                    abs(t.bottom - frag.bottom) < self.LINE_FACTOR*frag.height:
470                    return t
471
472        match_found = True
473        while match_found:
474            match_found, match = False, None
475            for frag in self.texts:
476                match = find_match(frag)
477                if match is not None:
478                    match_found = True
479                    frag.coalesce(match, self.number)
480                    break
481            if match is not None:
482                self.texts.remove(match)
483
484    def first_pass(self):
485        'Sort page into regions and columns'
486        self.regions = []
487        if not self.elements:
488            return
489        for i, x in enumerate(self.elements):
490            x.idx = i
491        current_region = Region(self.opts, self.log)
492        processed = set()
493        for x in self.elements:
494            if x in processed:
495                continue
496            elems = set(self.find_elements_in_row_of(x))
497            columns = self.sort_into_columns(x, elems)
498            processed.update(elems)
499            if not current_region.contains(columns):
500                self.regions.append(current_region)
501                current_region = Region(self.opts, self.log)
502            current_region.add(columns)
503        if not current_region.is_empty:
504            self.regions.append(current_region)
505
506        if self.opts.verbose > 2:
507            self.debug_dir = 'page-%d'%self.number
508            os.mkdir(self.debug_dir)
509            self.dump_regions('pre-coalesce')
510
511        self.coalesce_regions()
512        self.dump_regions('post-coalesce')
513
514    def dump_regions(self, fname):
515        fname = 'regions-'+fname+'.txt'
516        with open(os.path.join(self.debug_dir, fname), 'wb') as f:
517            f.write('Page #%d\n\n'%self.number)
518            for region in self.regions:
519                region.dump(f)
520
521    def coalesce_regions(self):
522        # find contiguous sets of small regions
523        # absorb into a neighboring region (prefer the one with number of cols
524        # closer to the avg number of cols in the set, if equal use larger
525        # region)
526        found = True
527        absorbed = set()
528        processed = set()
529        while found:
530            found = False
531            for i, region in enumerate(self.regions):
532                if region in absorbed:
533                    continue
534                if region.is_small and region not in processed:
535                    found = True
536                    processed.add(region)
537                    regions = [region]
538                    end = i+1
539                    for j in range(i+1, len(self.regions)):
540                        end = j
541                        if self.regions[j].is_small:
542                            regions.append(self.regions[j])
543                        else:
544                            break
545                    prev_region = None if i == 0 else i-1
546                    next_region = end if end < len(self.regions) and self.regions[end] not in regions else None
547                    absorb_at = 'bottom'
548                    if prev_region is None and next_region is not None:
549                        absorb_into = next_region
550                        absorb_at = 'top'
551                    elif next_region is None and prev_region is not None:
552                        absorb_into = prev_region
553                    elif prev_region is None and next_region is None:
554                        if len(regions) > 1:
555                            absorb_into = i
556                            regions = regions[1:]
557                        else:
558                            absorb_into = None
559                    else:
560                        absorb_into = prev_region
561                        if self.regions[next_region].line_count >= \
562                                self.regions[prev_region].line_count:
563                            avg_column_count = sum(len(r.columns) for r in
564                                regions)/float(len(regions))
565                            if self.regions[next_region].line_count > \
566                                    self.regions[prev_region].line_count \
567                               or abs(avg_column_count -
568                                       len(self.regions[prev_region].columns)) \
569                               > abs(avg_column_count -
570                                       len(self.regions[next_region].columns)):
571                                absorb_into = next_region
572                                absorb_at = 'top'
573                    if absorb_into is not None:
574                        self.regions[absorb_into].absorb_regions(regions, absorb_at)
575                        absorbed.update(regions)
576        for region in absorbed:
577            self.regions.remove(region)
578
579    def sort_into_columns(self, elem, neighbors):
580        neighbors.add(elem)
581        neighbors = sorted(neighbors, key=lambda x: x.left)
582        if self.opts.verbose > 3:
583            self.log.debug('Neighbors:', [x.to_html() for x in neighbors])
584        columns = [Column()]
585        columns[0].add(elem)
586        for x in neighbors:
587            added = False
588            for c in columns:
589                if c.contains(x):
590                    c.add(x)
591                    added = True
592                    break
593            if not added:
594                columns.append(Column())
595                columns[-1].add(x)
596                columns.sort(key=lambda x: x.left)
597        return columns
598
599    def find_elements_in_row_of(self, x):
600        interval = Interval(x.top,
601                x.top + self.YFUZZ*(self.average_text_height))
602        h_interval = Interval(x.left, x.right)
603        for y in self.elements[x.idx:x.idx+15]:
604            if y is not x:
605                y_interval = Interval(y.top, y.bottom)
606                x_interval = Interval(y.left, y.right)
607                if interval.intersection(y_interval).width > \
608                    0.5*self.average_text_height and \
609                    x_interval.intersection(h_interval).width <= 0:
610                    yield y
611
612    def second_pass(self):
613        'Locate paragraph boundaries in each column'
614        for region in self.regions:
615            region.collect_stats()
616            region.linearize()
617
618
619class PDFDocument:
620
621    def __init__(self, xml, opts, log):
622        self.opts, self.log = opts, log
623        self.root = safe_xml_fromstring(xml)
624        idc = count()
625
626        self.fonts = []
627        self.font_map = {}
628
629        for spec in self.root.xpath('//font'):
630            self.fonts.append(Font(spec))
631            self.font_map[self.fonts[-1].id] = self.fonts[-1]
632
633        self.pages = []
634        self.page_map = {}
635
636        for page in self.root.xpath('//page'):
637            page = Page(page, self.font_map, opts, log, idc)
638            self.page_map[page.id] = page
639            self.pages.append(page)
640
641        self.collect_font_statistics()
642
643        for page in self.pages:
644            page.document_font_stats = self.font_size_stats
645            page.first_pass()
646            page.second_pass()
647
648        self.linearize()
649        self.render()
650
651    def collect_font_statistics(self):
652        self.font_size_stats = {}
653        for p in self.pages:
654            for sz in p.font_size_stats:
655                chars = p.font_size_stats[sz]
656                if sz not in self.font_size_stats:
657                    self.font_size_stats[sz] = 0
658                self.font_size_stats[sz] += chars
659
660        self.font_size_stats = FontSizeStats(self.font_size_stats)
661
662    def linearize(self):
663        self.elements = []
664        last_region = last_block = None
665        for page in self.pages:
666            page_number_inserted = False
667            for region in page.regions:
668                merge_first_block = last_region is not None and \
669                    len(last_region.columns) == len(region.columns) and \
670                    not hasattr(last_block, 'img')
671                for i, block in enumerate(region.boxes):
672                    if merge_first_block:
673                        merge_first_block = False
674                        if not page_number_inserted:
675                            last_block.append(page.number)
676                            page_number_inserted = True
677                        for elem in block:
678                            last_block.append(elem)
679                    else:
680                        if not page_number_inserted:
681                            block.insert(0, page.number)
682                            page_number_inserted = True
683                        self.elements.append(block)
684                    last_block = block
685                last_region = region
686
687    def render(self):
688        html = ['<?xml version="1.0" encoding="UTF-8"?>',
689                '<html xmlns="http://www.w3.org/1999/xhtml">', '<head>',
690                '<title>PDF Reflow conversion</title>', '</head>', '<body>',
691                '<div>']
692        for elem in self.elements:
693            html.extend(elem.to_html())
694        html += ['</body>', '</html>']
695        raw = ('\n'.join(html)).replace('</strong><strong>', '')
696        with open('index.html', 'wb') as f:
697            f.write(raw.encode('utf-8'))
698