1#!/usr/local/bin/python3.8 2# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai 3 4 5__license__ = 'GPL v3' 6__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' 7__docformat__ = 'restructuredtext en' 8 9import sys, os, numbers 10from itertools import count 11 12from lxml import etree 13 14from calibre.utils.xml_parse import safe_xml_fromstring 15 16 17class Font: 18 19 def __init__(self, spec): 20 self.id = spec.get('id') 21 self.size = float(spec.get('size')) 22 self.color = spec.get('color') 23 self.family = spec.get('family') 24 25 26class Element: 27 28 def __init__(self): 29 self.starts_block = None 30 self.block_style = None 31 32 def __eq__(self, other): 33 return self.id == other.id 34 35 def __hash__(self): 36 return hash(self.id) 37 38 39class Image(Element): 40 41 def __init__(self, img, opts, log, idc): 42 Element.__init__(self) 43 self.opts, self.log = opts, log 44 self.id = next(idc) 45 self.top, self.left, self.width, self.height, self.iwidth, self.iheight = \ 46 map(float, map(img.get, ('top', 'left', 'rwidth', 'rheight', 'iwidth', 47 'iheight'))) 48 self.src = img.get('src') 49 self.bottom = self.top + self.height 50 self.right = self.left + self.width 51 52 def to_html(self): 53 return '<img src="%s" width="%dpx" height="%dpx"/>' % \ 54 (self.src, int(self.width), int(self.height)) 55 56 def dump(self, f): 57 f.write(self.to_html()) 58 f.write('\n') 59 60 61class Text(Element): 62 63 def __init__(self, text, font_map, opts, log, idc): 64 Element.__init__(self) 65 self.id = next(idc) 66 self.opts, self.log = opts, log 67 self.font_map = font_map 68 self.top, self.left, self.width, self.height = map(float, map(text.get, 69 ('top', 'left', 'width', 'height'))) 70 self.bottom = self.top + self.height 71 self.right = self.left + self.width 72 self.font = self.font_map[text.get('font')] 73 self.font_size = self.font.size 74 self.color = self.font.color 75 self.font_family = self.font.family 76 77 text.tail = '' 78 self.text_as_string = etree.tostring(text, method='text', 79 encoding='unicode') 80 self.raw = text.text if text.text else '' 81 for x in text.iterchildren(): 82 self.raw += etree.tostring(x, method='xml', encoding='unicode') 83 self.average_character_width = self.width/len(self.text_as_string) 84 85 def coalesce(self, other, page_number): 86 if self.opts.verbose > 2: 87 self.log.debug('Coalescing %r with %r on page %d'%(self.text_as_string, 88 other.text_as_string, page_number)) 89 self.top = min(self.top, other.top) 90 self.right = other.right 91 self.width = self.right - self.left 92 self.bottom = max(self.bottom, other.bottom) 93 self.height = self.bottom - self.top 94 self.font_size = max(self.font_size, other.font_size) 95 self.font = other.font if self.font_size == other.font_size else other.font 96 self.text_as_string += other.text_as_string 97 self.raw += other.raw 98 self.average_character_width = (self.average_character_width + 99 other.average_character_width)/2.0 100 101 def to_html(self): 102 return self.raw 103 104 def dump(self, f): 105 f.write(self.to_html().encode('utf-8')) 106 f.write('\n') 107 108 109class FontSizeStats(dict): 110 111 def __init__(self, stats): 112 total = float(sum(stats.values())) 113 self.most_common_size, self.chars_at_most_common_size = -1, 0 114 115 for sz, chars in stats.items(): 116 if chars >= self.chars_at_most_common_size: 117 self.most_common_size, self.chars_at_most_common_size = sz, chars 118 self[sz] = chars/total 119 120 121class Interval: 122 123 def __init__(self, left, right): 124 self.left, self.right = left, right 125 self.width = right - left 126 127 def intersection(self, other): 128 left = max(self.left, other.left) 129 right = min(self.right, other.right) 130 return Interval(left, right) 131 132 def centered_in(self, parent): 133 left = abs(self.left - parent.left) 134 right = abs(self.right - parent.right) 135 return abs(left-right) < 3 136 137 def __nonzero__(self): 138 return self.width > 0 139 140 def __eq__(self, other): 141 return self.left == other.left and self.right == other.right 142 143 def __hash__(self): 144 return hash('(%f,%f)'%self.left, self.right) 145 146 147class Column: 148 149 # A column contains an element is the element bulges out to 150 # the left or the right by at most HFUZZ*col width. 151 HFUZZ = 0.2 152 153 def __init__(self): 154 self.left = self.right = self.top = self.bottom = 0 155 self.width = self.height = 0 156 self.elements = [] 157 self.average_line_separation = 0 158 159 def add(self, elem): 160 if elem in self.elements: 161 return 162 self.elements.append(elem) 163 self._post_add() 164 165 def prepend(self, elem): 166 if elem in self.elements: 167 return 168 self.elements.insert(0, elem) 169 self._post_add() 170 171 def _post_add(self): 172 self.elements.sort(key=lambda x: x.bottom) 173 self.top = self.elements[0].top 174 self.bottom = self.elements[-1].bottom 175 self.left, self.right = sys.maxsize, 0 176 for x in self: 177 self.left = min(self.left, x.left) 178 self.right = max(self.right, x.right) 179 self.width, self.height = self.right-self.left, self.bottom-self.top 180 181 def __iter__(self): 182 yield from self.elements 183 184 def __len__(self): 185 return len(self.elements) 186 187 def contains(self, elem): 188 return elem.left > self.left - self.HFUZZ*self.width and \ 189 elem.right < self.right + self.HFUZZ*self.width 190 191 def collect_stats(self): 192 if len(self.elements) > 1: 193 gaps = [self.elements[i+1].top - self.elements[i].bottom for i in 194 range(0, len(self.elements)-1)] 195 self.average_line_separation = sum(gaps)/len(gaps) 196 for i, elem in enumerate(self.elements): 197 left_margin = elem.left - self.left 198 elem.indent_fraction = left_margin/self.width 199 elem.width_fraction = elem.width/self.width 200 if i == 0: 201 elem.top_gap_ratio = None 202 else: 203 elem.top_gap_ratio = (self.elements[i-1].bottom - 204 elem.top)/self.average_line_separation 205 206 def previous_element(self, idx): 207 if idx == 0: 208 return None 209 return self.elements[idx-1] 210 211 def dump(self, f, num): 212 f.write('******** Column %d\n\n'%num) 213 for elem in self.elements: 214 elem.dump(f) 215 216 217class Box(list): 218 219 def __init__(self, type='p'): 220 self.tag = type 221 222 def to_html(self): 223 ans = ['<%s>'%self.tag] 224 for elem in self: 225 if isinstance(elem, numbers.Integral): 226 ans.append('<a name="page_%d"/>'%elem) 227 else: 228 ans.append(elem.to_html()+' ') 229 ans.append('</%s>'%self.tag) 230 return ans 231 232 233class ImageBox(Box): 234 235 def __init__(self, img): 236 Box.__init__(self) 237 self.img = img 238 239 def to_html(self): 240 ans = ['<div style="text-align:center">'] 241 ans.append(self.img.to_html()) 242 if len(self) > 0: 243 ans.append('<br/>') 244 for elem in self: 245 if isinstance(elem, numbers.Integral): 246 ans.append('<a name="page_%d"/>'%elem) 247 else: 248 ans.append(elem.to_html()+' ') 249 ans.append('</div>') 250 return ans 251 252 253class Region: 254 255 def __init__(self, opts, log): 256 self.opts, self.log = opts, log 257 self.columns = [] 258 self.top = self.bottom = self.left = self.right = self.width = self.height = 0 259 260 def add(self, columns): 261 if not self.columns: 262 for x in sorted(columns, key=lambda x: x.left): 263 self.columns.append(x) 264 else: 265 for i in range(len(columns)): 266 for elem in columns[i]: 267 self.columns[i].add(elem) 268 269 def contains(self, columns): 270 # TODO: handle unbalanced columns 271 if not self.columns: 272 return True 273 if len(columns) != len(self.columns): 274 return False 275 for i in range(len(columns)): 276 c1, c2 = self.columns[i], columns[i] 277 x1 = Interval(c1.left, c1.right) 278 x2 = Interval(c2.left, c2.right) 279 intersection = x1.intersection(x2) 280 base = min(x1.width, x2.width) 281 if intersection.width/base < 0.6: 282 return False 283 return True 284 285 @property 286 def is_empty(self): 287 return len(self.columns) == 0 288 289 @property 290 def line_count(self): 291 max_lines = 0 292 for c in self.columns: 293 max_lines = max(max_lines, len(c)) 294 return max_lines 295 296 @property 297 def is_small(self): 298 return self.line_count < 3 299 300 def absorb(self, singleton): 301 302 def most_suitable_column(elem): 303 mc, mw = None, 0 304 for c in self.columns: 305 i = Interval(c.left, c.right) 306 e = Interval(elem.left, elem.right) 307 w = i.intersection(e).width 308 if w > mw: 309 mc, mw = c, w 310 if mc is None: 311 self.log.warn('No suitable column for singleton', 312 elem.to_html()) 313 mc = self.columns[0] 314 return mc 315 316 for c in singleton.columns: 317 for elem in c: 318 col = most_suitable_column(elem) 319 if self.opts.verbose > 3: 320 idx = self.columns.index(col) 321 self.log.debug('Absorbing singleton %s into column'%elem.to_html(), 322 idx) 323 col.add(elem) 324 325 def collect_stats(self): 326 for column in self.columns: 327 column.collect_stats() 328 self.average_line_separation = sum(x.average_line_separation for x in 329 self.columns)/float(len(self.columns)) 330 331 def __iter__(self): 332 yield from self.columns 333 334 def absorb_regions(self, regions, at): 335 for region in regions: 336 self.absorb_region(region, at) 337 338 def absorb_region(self, region, at): 339 if len(region.columns) <= len(self.columns): 340 for i in range(len(region.columns)): 341 src, dest = region.columns[i], self.columns[i] 342 if at != 'bottom': 343 src = reversed(list(iter(src))) 344 for elem in src: 345 func = dest.add if at == 'bottom' else dest.prepend 346 func(elem) 347 348 else: 349 col_map = {} 350 for i, col in enumerate(region.columns): 351 max_overlap, max_overlap_index = 0, 0 352 for j, dcol in enumerate(self.columns): 353 sint = Interval(col.left, col.right) 354 dint = Interval(dcol.left, dcol.right) 355 width = sint.intersection(dint).width 356 if width > max_overlap: 357 max_overlap = width 358 max_overlap_index = j 359 col_map[i] = max_overlap_index 360 lines = max(map(len, region.columns)) 361 if at == 'bottom': 362 lines = range(lines) 363 else: 364 lines = range(lines-1, -1, -1) 365 for i in lines: 366 for j, src in enumerate(region.columns): 367 dest = self.columns[col_map[j]] 368 if i < len(src): 369 func = dest.add if at == 'bottom' else dest.prepend 370 func(src.elements[i]) 371 372 def dump(self, f): 373 f.write('############################################################\n') 374 f.write('########## Region (%d columns) ###############\n'%len(self.columns)) 375 f.write('############################################################\n\n') 376 for i, col in enumerate(self.columns): 377 col.dump(f, i) 378 379 def linearize(self): 380 self.elements = [] 381 for x in self.columns: 382 self.elements.extend(x) 383 self.boxes = [Box()] 384 for i, elem in enumerate(self.elements): 385 if isinstance(elem, Image): 386 self.boxes.append(ImageBox(elem)) 387 img = Interval(elem.left, elem.right) 388 for j in range(i+1, len(self.elements)): 389 t = self.elements[j] 390 if not isinstance(t, Text): 391 break 392 ti = Interval(t.left, t.right) 393 if not ti.centered_in(img): 394 break 395 self.boxes[-1].append(t) 396 self.boxes.append(Box()) 397 else: 398 is_indented = False 399 if i+1 < len(self.elements): 400 indent_diff = elem.indent_fraction - \ 401 self.elements[i+1].indent_fraction 402 if indent_diff > 0.05: 403 is_indented = True 404 if elem.top_gap_ratio > 1.2 or is_indented: 405 self.boxes.append(Box()) 406 self.boxes[-1].append(elem) 407 408 409class Page: 410 411 # Fraction of a character width that two strings have to be apart, 412 # for them to be considered part of the same text fragment 413 COALESCE_FACTOR = 0.5 414 415 # Fraction of text height that two strings' bottoms can differ by 416 # for them to be considered to be part of the same text fragment 417 LINE_FACTOR = 0.4 418 419 # Multiplies the average line height when determining row height 420 # of a particular element to detect columns. 421 YFUZZ = 1.5 422 423 def __init__(self, page, font_map, opts, log, idc): 424 self.opts, self.log = opts, log 425 self.font_map = font_map 426 self.number = int(page.get('number')) 427 self.width, self.height = map(float, map(page.get, 428 ('width', 'height'))) 429 self.id = 'page%d'%self.number 430 431 self.texts = [] 432 self.left_margin, self.right_margin = self.width, 0 433 434 for text in page.xpath('descendant::text'): 435 self.texts.append(Text(text, self.font_map, self.opts, self.log, idc)) 436 text = self.texts[-1] 437 self.left_margin = min(text.left, self.left_margin) 438 self.right_margin = max(text.right, self.right_margin) 439 440 self.textwidth = self.right_margin - self.left_margin 441 442 self.font_size_stats = {} 443 self.average_text_height = 0 444 for t in self.texts: 445 if t.font_size not in self.font_size_stats: 446 self.font_size_stats[t.font_size] = 0 447 self.font_size_stats[t.font_size] += len(t.text_as_string) 448 self.average_text_height += t.height 449 if len(self.texts): 450 self.average_text_height /= len(self.texts) 451 452 self.font_size_stats = FontSizeStats(self.font_size_stats) 453 454 self.coalesce_fragments() 455 456 self.elements = list(self.texts) 457 for img in page.xpath('descendant::img'): 458 self.elements.append(Image(img, self.opts, self.log, idc)) 459 self.elements.sort(key=lambda x: x.top) 460 461 def coalesce_fragments(self): 462 463 def find_match(frag): 464 for t in self.texts: 465 hdelta = t.left - frag.right 466 hoverlap = self.COALESCE_FACTOR * frag.average_character_width 467 if t is not frag and hdelta > -hoverlap and \ 468 hdelta < hoverlap and \ 469 abs(t.bottom - frag.bottom) < self.LINE_FACTOR*frag.height: 470 return t 471 472 match_found = True 473 while match_found: 474 match_found, match = False, None 475 for frag in self.texts: 476 match = find_match(frag) 477 if match is not None: 478 match_found = True 479 frag.coalesce(match, self.number) 480 break 481 if match is not None: 482 self.texts.remove(match) 483 484 def first_pass(self): 485 'Sort page into regions and columns' 486 self.regions = [] 487 if not self.elements: 488 return 489 for i, x in enumerate(self.elements): 490 x.idx = i 491 current_region = Region(self.opts, self.log) 492 processed = set() 493 for x in self.elements: 494 if x in processed: 495 continue 496 elems = set(self.find_elements_in_row_of(x)) 497 columns = self.sort_into_columns(x, elems) 498 processed.update(elems) 499 if not current_region.contains(columns): 500 self.regions.append(current_region) 501 current_region = Region(self.opts, self.log) 502 current_region.add(columns) 503 if not current_region.is_empty: 504 self.regions.append(current_region) 505 506 if self.opts.verbose > 2: 507 self.debug_dir = 'page-%d'%self.number 508 os.mkdir(self.debug_dir) 509 self.dump_regions('pre-coalesce') 510 511 self.coalesce_regions() 512 self.dump_regions('post-coalesce') 513 514 def dump_regions(self, fname): 515 fname = 'regions-'+fname+'.txt' 516 with open(os.path.join(self.debug_dir, fname), 'wb') as f: 517 f.write('Page #%d\n\n'%self.number) 518 for region in self.regions: 519 region.dump(f) 520 521 def coalesce_regions(self): 522 # find contiguous sets of small regions 523 # absorb into a neighboring region (prefer the one with number of cols 524 # closer to the avg number of cols in the set, if equal use larger 525 # region) 526 found = True 527 absorbed = set() 528 processed = set() 529 while found: 530 found = False 531 for i, region in enumerate(self.regions): 532 if region in absorbed: 533 continue 534 if region.is_small and region not in processed: 535 found = True 536 processed.add(region) 537 regions = [region] 538 end = i+1 539 for j in range(i+1, len(self.regions)): 540 end = j 541 if self.regions[j].is_small: 542 regions.append(self.regions[j]) 543 else: 544 break 545 prev_region = None if i == 0 else i-1 546 next_region = end if end < len(self.regions) and self.regions[end] not in regions else None 547 absorb_at = 'bottom' 548 if prev_region is None and next_region is not None: 549 absorb_into = next_region 550 absorb_at = 'top' 551 elif next_region is None and prev_region is not None: 552 absorb_into = prev_region 553 elif prev_region is None and next_region is None: 554 if len(regions) > 1: 555 absorb_into = i 556 regions = regions[1:] 557 else: 558 absorb_into = None 559 else: 560 absorb_into = prev_region 561 if self.regions[next_region].line_count >= \ 562 self.regions[prev_region].line_count: 563 avg_column_count = sum(len(r.columns) for r in 564 regions)/float(len(regions)) 565 if self.regions[next_region].line_count > \ 566 self.regions[prev_region].line_count \ 567 or abs(avg_column_count - 568 len(self.regions[prev_region].columns)) \ 569 > abs(avg_column_count - 570 len(self.regions[next_region].columns)): 571 absorb_into = next_region 572 absorb_at = 'top' 573 if absorb_into is not None: 574 self.regions[absorb_into].absorb_regions(regions, absorb_at) 575 absorbed.update(regions) 576 for region in absorbed: 577 self.regions.remove(region) 578 579 def sort_into_columns(self, elem, neighbors): 580 neighbors.add(elem) 581 neighbors = sorted(neighbors, key=lambda x: x.left) 582 if self.opts.verbose > 3: 583 self.log.debug('Neighbors:', [x.to_html() for x in neighbors]) 584 columns = [Column()] 585 columns[0].add(elem) 586 for x in neighbors: 587 added = False 588 for c in columns: 589 if c.contains(x): 590 c.add(x) 591 added = True 592 break 593 if not added: 594 columns.append(Column()) 595 columns[-1].add(x) 596 columns.sort(key=lambda x: x.left) 597 return columns 598 599 def find_elements_in_row_of(self, x): 600 interval = Interval(x.top, 601 x.top + self.YFUZZ*(self.average_text_height)) 602 h_interval = Interval(x.left, x.right) 603 for y in self.elements[x.idx:x.idx+15]: 604 if y is not x: 605 y_interval = Interval(y.top, y.bottom) 606 x_interval = Interval(y.left, y.right) 607 if interval.intersection(y_interval).width > \ 608 0.5*self.average_text_height and \ 609 x_interval.intersection(h_interval).width <= 0: 610 yield y 611 612 def second_pass(self): 613 'Locate paragraph boundaries in each column' 614 for region in self.regions: 615 region.collect_stats() 616 region.linearize() 617 618 619class PDFDocument: 620 621 def __init__(self, xml, opts, log): 622 self.opts, self.log = opts, log 623 self.root = safe_xml_fromstring(xml) 624 idc = count() 625 626 self.fonts = [] 627 self.font_map = {} 628 629 for spec in self.root.xpath('//font'): 630 self.fonts.append(Font(spec)) 631 self.font_map[self.fonts[-1].id] = self.fonts[-1] 632 633 self.pages = [] 634 self.page_map = {} 635 636 for page in self.root.xpath('//page'): 637 page = Page(page, self.font_map, opts, log, idc) 638 self.page_map[page.id] = page 639 self.pages.append(page) 640 641 self.collect_font_statistics() 642 643 for page in self.pages: 644 page.document_font_stats = self.font_size_stats 645 page.first_pass() 646 page.second_pass() 647 648 self.linearize() 649 self.render() 650 651 def collect_font_statistics(self): 652 self.font_size_stats = {} 653 for p in self.pages: 654 for sz in p.font_size_stats: 655 chars = p.font_size_stats[sz] 656 if sz not in self.font_size_stats: 657 self.font_size_stats[sz] = 0 658 self.font_size_stats[sz] += chars 659 660 self.font_size_stats = FontSizeStats(self.font_size_stats) 661 662 def linearize(self): 663 self.elements = [] 664 last_region = last_block = None 665 for page in self.pages: 666 page_number_inserted = False 667 for region in page.regions: 668 merge_first_block = last_region is not None and \ 669 len(last_region.columns) == len(region.columns) and \ 670 not hasattr(last_block, 'img') 671 for i, block in enumerate(region.boxes): 672 if merge_first_block: 673 merge_first_block = False 674 if not page_number_inserted: 675 last_block.append(page.number) 676 page_number_inserted = True 677 for elem in block: 678 last_block.append(elem) 679 else: 680 if not page_number_inserted: 681 block.insert(0, page.number) 682 page_number_inserted = True 683 self.elements.append(block) 684 last_block = block 685 last_region = region 686 687 def render(self): 688 html = ['<?xml version="1.0" encoding="UTF-8"?>', 689 '<html xmlns="http://www.w3.org/1999/xhtml">', '<head>', 690 '<title>PDF Reflow conversion</title>', '</head>', '<body>', 691 '<div>'] 692 for elem in self.elements: 693 html.extend(elem.to_html()) 694 html += ['</body>', '</html>'] 695 raw = ('\n'.join(html)).replace('</strong><strong>', '') 696 with open('index.html', 'wb') as f: 697 f.write(raw.encode('utf-8')) 698