#!/usr/local/bin/python3.8
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

'''
Generate the MOBI index structures: the CNCX string table, the INDX/TAGX
index records (primary and, for periodicals, secondary), and the per-text-
record trailing byte sequences (TBS) used by Kindle firmware to navigate
periodicals and books.
'''

__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import numbers
from struct import pack
import io
from collections import OrderedDict, defaultdict

from calibre.ebooks.mobi.utils import (encint, encode_number_as_hex,
        encode_tbs, align_block, RECORD_SIZE, CNCX as CNCX_)
from polyglot.builtins import iteritems, itervalues


class CNCX(CNCX_):  # {{{

    '''
    CNCX string table built from every TOC node's title. For periodicals
    the klass, author and description strings are interned as well, since
    periodical index entries reference them by CNCX offset.
    '''

    def __init__(self, toc, is_periodical):
        strings = []
        for item in toc.iterdescendants(breadth_first=True):
            strings.append(item.title)
            if is_periodical:
                strings.append(item.klass)
                if item.author:
                    strings.append(item.author)
                if item.description:
                    strings.append(item.description)
        CNCX_.__init__(self, strings)
# }}}


class TAGX:  # {{{

    '''
    Builder for the TAGX control block of an INDX header. The block is a
    sequence of four-byte tag descriptors (tag, value count, bitmask,
    end-of-control-byte flag) preceded by a 12-byte header.
    '''

    # Tag number -> bitmask within its control byte. Tags 69-73 restart
    # at bit 0 because they are flagged in a *second* control byte (see
    # IndexEntry.bytestring, which only writes them when
    # control_byte_count == 2). Tag 11 is used only by secondary indices.
    BITMASKS = {11:0b1}
    BITMASKS.update({x:(1 << i) for i, x in enumerate([1, 2, 3, 4, 5, 21, 22, 23])})
    BITMASKS.update({x:(1 << i) for i, x in enumerate([69, 70, 71, 72, 73])})

    # Number of values carried by each tag; defaults to 1. Tag 0 is the
    # end-of-control-byte marker (no values), tag 11 carries three.
    NUM_VALUES = defaultdict(lambda :1)
    NUM_VALUES[11] = 3
    NUM_VALUES[0] = 0

    def __init__(self):
        self.byts = bytearray()

    def add_tag(self, tag):
        '''Append the four-byte descriptor for *tag* to the block.'''
        buf = self.byts
        buf.append(tag)
        buf.append(self.NUM_VALUES[tag])
        # bitmask (zero for the tag-0 terminator)
        buf.append(self.BITMASKS[tag] if tag else 0)
        # eof flag: 1 terminates the current control byte
        buf.append(0 if tag else 1)

    def header(self, control_byte_count):
        '''12-byte TAGX header: ident, total table length, control byte count.'''
        header = b'TAGX'
        # table length, control byte count
        header += pack(b'>II', 12+len(self.byts), control_byte_count)
        return header

    @property
    def periodical(self):
        '''
        TAGX block for the Primary index header of a periodical
        '''
        # Two control bytes: the second covers tags 69-73.
        for i in (1, 2, 3, 4, 5, 21, 22, 23, 0, 69, 70, 71, 72, 73, 0):
            self.add_tag(i)
        return self.header(2) + bytes(self.byts)

    @property
    def secondary(self):
        '''
        TAGX block for the secondary index header of a periodical
        '''
        for i in (11, 0):
            self.add_tag(i)
        return self.header(1) + bytes(self.byts)

    @property
    def flat_book(self):
        '''
        TAGX block for the primary index header of a flat book
        '''
        for i in (1, 2, 3, 4, 0):
            self.add_tag(i)
        return self.header(1) + bytes(self.byts)


# }}}

# Index Entries {{{

class IndexEntry:

    '''
    A single index (NCX) entry. Knows how to serialize itself into the
    tag/value byte string stored in an index record (see ``bytestring``).
    '''

    # Symbolic attribute name -> TAGX tag number.
    TAG_VALUES = {
            'offset': 1,
            'size': 2,
            'label_offset': 3,
            'depth': 4,
            'class_offset': 5,
            'secondary': 11,
            'parent_index': 21,
            'first_child_index': 22,
            'last_child_index': 23,
            'image_index': 69,
            'desc_offset': 70,
            'author_offset': 71,

    }
    RTAG_MAP = {v:k for k, v in iteritems(TAG_VALUES)}  # noqa

    def __init__(self, offset, label_offset):
        # offset: start of this entry's text in the flattened text stream
        # label_offset: CNCX offset of the entry's title string
        self.offset, self.label_offset = offset, label_offset
        self.depth, self.class_offset = 0, None
        self.control_byte_count = 1

        self.length = 0
        self.index = 0

        # Hierarchy links, set by the Indexer for periodicals
        self.parent_index = None
        self.first_child_index = None
        self.last_child_index = None

        # Periodical-only extras (second control byte)
        self.image_index = None
        self.author_offset = None
        self.desc_offset = None

    def __repr__(self):
        return ('IndexEntry(offset=%r, depth=%r, length=%r, index=%r,'
                ' parent_index=%r)')%(self.offset, self.depth, self.length,
                        self.index, self.parent_index)

    @property
    def size(self):
        # Alias: tag 2 is called "size" in the TAGX table but stored as length
        return self.length

    @size.setter
    def size(self, val):
        self.length = val

    @property
    def next_offset(self):
        return self.offset + self.length

    @property
    def tag_nums(self):
        '''Yield the tag numbers (first control byte) present in this entry.'''
        # Tags 1-4 (offset, size, label, depth) are always present
        yield from range(1, 5)
        for attr in ('class_offset', 'parent_index', 'first_child_index',
                'last_child_index'):
            if getattr(self, attr) is not None:
                yield self.TAG_VALUES[attr]

    @property
    def entry_type(self):
        '''First control byte: OR of the bitmasks of all present tags.'''
        ans = 0
        for tag in self.tag_nums:
            ans |= TAGX.BITMASKS[tag]
        return ans

    def attr_for_tag(self, tag):
        return self.RTAG_MAP[tag]

    @property
    def bytestring(self):
        '''Serialize this entry: index ident, control byte(s), tag values.'''
        buf = io.BytesIO()
        # The entry identifier: hex-encoded integer index, or a raw
        # length-prefixed ASCII string (used by secondary index entries)
        if isinstance(self.index, numbers.Integral):
            buf.write(encode_number_as_hex(self.index))
        else:
            raw = bytearray(self.index.encode('ascii'))
            raw.insert(0, len(raw))
            buf.write(bytes(raw))
        et = self.entry_type
        buf.write(bytes(bytearray([et])))

        if self.control_byte_count == 2:
            # Second control byte flags the periodical-only tags 69-71
            flags = 0
            for attr in ('image_index', 'desc_offset', 'author_offset'):
                val = getattr(self, attr)
                if val is not None:
                    tag = self.TAG_VALUES[attr]
                    bm = TAGX.BITMASKS[tag]
                    flags |= bm
            buf.write(bytes(bytearray([flags])))

        # Values for the first-control-byte tags, as variable-width ints
        for tag in self.tag_nums:
            attr = self.attr_for_tag(tag)
            val = getattr(self, attr)
            if isinstance(val, numbers.Integral):
                val = [val]
            for x in val:
                buf.write(encint(x))

        if self.control_byte_count == 2:
            # Values for the second-control-byte tags, same order as flags
            for attr in ('image_index', 'desc_offset', 'author_offset'):
                val = getattr(self, attr)
                if val is not None:
                    buf.write(encint(val))

        ans = buf.getvalue()
        return ans


class PeriodicalIndexEntry(IndexEntry):

    '''
    Index entry for a periodical node (periodical root, section or
    article). Carries a class string and uses two control bytes.
    '''

    def __init__(self, offset, label_offset, class_offset, depth):
        IndexEntry.__init__(self, offset, label_offset)
        # depth: 0 = periodical root, 1 = section, 2 = article
        self.depth = depth
        self.class_offset = class_offset
        self.control_byte_count = 2


class SecondaryIndexEntry(IndexEntry):

    '''
    Entry in the secondary index of a periodical. Its identifier is a
    string (the semantic kind), not an integer, and its only tag is 11.
    '''

    INDEX_MAP = {'author':73, 'caption':72, 'credit':71, 'description':70,
            'mastheadImage':69}

    def __init__(self, index):
        IndexEntry.__init__(self, 0, 0)
        self.index = index

        tag = self.INDEX_MAP[index]

        # The values for this index entry
        # I don't know what the 5 means, it is not the number of entries.
        # It is only written on the entry with the smallest tag value.
        self.secondary = [5 if tag == min(
            itervalues(self.INDEX_MAP)) else 0, 0, tag]

    @property
    def tag_nums(self):
        yield 11

    @property
    def entry_type(self):
        return 1

    @classmethod
    def entries(cls):
        '''Yield one entry per INDEX_MAP kind, in descending tag order.'''
        rmap = {v:k for k,v in iteritems(cls.INDEX_MAP)}
        for tag in sorted(rmap, reverse=True):
            yield cls(rmap[tag])

# }}}


class TBS:  # {{{

    '''
    Take the list of index nodes starting/ending on a record and calculate the
    trailing byte sequence for the record.
    '''

    # NOTE(review): the mutable default for section_map is never mutated
    # (it is only read in periodical_tbs), so the shared default is safe.
    def __init__(self, data, is_periodical, first=False, section_map={},
            after_first=False):
        self.section_map = section_map

        if is_periodical:
            # The starting bytes.
            # The value is zero which I think indicates the periodical
            # index entry. The values for the various flags seem to be
            # unused. If the 0b100 is present, it means that the record
            # deals with section 1 (or is the final record with section
            # transitions).
            self.type_010 = encode_tbs(0, {0b010: 0}, flag_size=3)
            self.type_011 = encode_tbs(0, {0b010: 0, 0b001: 0},
                    flag_size=3)
            self.type_110 = encode_tbs(0, {0b100: 2, 0b010: 0},
                    flag_size=3)
            self.type_111 = encode_tbs(0, {0b100: 2, 0b010: 0, 0b001:
                0}, flag_size=3)

            if not data:
                byts = b''
                if after_first:
                    # This can happen if a record contains only text between
                    # the periodical start and the first section
                    byts = self.type_011
                self.bytestring = byts
            else:
                # Group the nodes touching this record by their TOC depth
                depth_map = defaultdict(list)
                for x in ('starts', 'ends', 'completes'):
                    for idx in data[x]:
                        depth_map[idx.depth].append(idx)
                for l in itervalues(depth_map):
                    l.sort(key=lambda x:x.offset)
                self.periodical_tbs(data, first, depth_map)
        else:
            if not data:
                self.bytestring = b''
            else:
                self.book_tbs(data, first)

    def periodical_tbs(self, data, first, depth_map):
        '''Compute self.bytestring for one record of a periodical.'''
        buf = io.BytesIO()

        has_section_start = (depth_map[1] and
                set(depth_map[1]).intersection(set(data['starts'])))
        spanner = data['spans']
        parent_section_index = -1

        if depth_map[0]:
            # We have a terminal record

            # Find the first non periodical node
            first_node = None
            for nodes in (depth_map[1], depth_map[2]):
                for node in nodes:
                    if (first_node is None or (node.offset, node.depth) <
                            (first_node.offset, first_node.depth)):
                        first_node = node

            typ = (self.type_110 if has_section_start else self.type_010)

            # parent_section_index is needed for the last record
            if first_node is not None and first_node.depth > 0:
                parent_section_index = (first_node.index if first_node.depth == 1 else first_node.parent_index)
            else:
                parent_section_index = max(iter(self.section_map))

        else:
            # Non terminal record

            if spanner is not None:
                # record is spanned by a single article
                parent_section_index = spanner.parent_index
                typ = (self.type_110 if parent_section_index == 1 else
                        self.type_010)
            elif not depth_map[1]:
                # has only article nodes, i.e. spanned by a section
                parent_section_index = depth_map[2][0].parent_index
                typ = (self.type_111 if parent_section_index == 1 else
                        self.type_010)
            else:
                # has section transitions
                if depth_map[2]:
                    parent_section_index = depth_map[2][0].parent_index
                else:
                    parent_section_index = depth_map[1][0].index
                typ = self.type_011

        buf.write(typ)

        if typ not in (self.type_110, self.type_111) and parent_section_index > 0:
            extra = {}
            # Write starting section information
            if spanner is None:
                num_articles = len([a for a in depth_map[1] if a.parent_index == parent_section_index])
                if not depth_map[1]:
                    extra = {0b0001: 0}
                if num_articles > 1:
                    extra = {0b0100: num_articles}
            buf.write(encode_tbs(parent_section_index, extra))

        if spanner is None:
            # Per-section article information, in offset order
            articles = depth_map[2]
            sections = {self.section_map[a.parent_index] for a in
                    articles}
            sections = sorted(sections, key=lambda x:x.offset)
            section_map = {s:[a for a in articles if a.parent_index ==
                s.index] for s in sections}
            for i, section in enumerate(sections):
                # All the articles in this record that belong to section
                articles = section_map[section]
                first_article = articles[0]
                last_article = articles[-1]
                num = len(articles)
                last_article_ends = (last_article in data['ends'] or
                        last_article in data['completes'])

                try:
                    next_sec = sections[i+1]
                except:
                    next_sec = None

                extra = {}
                if num > 1:
                    extra[0b0100] = num
                if False and i == 0 and next_sec is not None:
                    # Write offset to next section from start of record
                    # I can't figure out exactly when Kindlegen decides to
                    # write this so I have disabled it for now.
                    extra[0b0001] = next_sec.offset - data['offset']

                buf.write(encode_tbs(first_article.index-section.index, extra))

                if next_sec is not None:
                    buf.write(encode_tbs(last_article.index-next_sec.index,
                        {0b1000: 0}))

                # If a section TOC starts and extends into the next record add
                # a trailing vwi. We detect this by TBS type==3, processing last
                # section present in the record, and the last article in that
                # section either ends or completes and doesn't finish
                # on the last byte of the record.
                elif (typ == self.type_011 and last_article_ends and
                        ((last_article.offset+last_article.size) % RECORD_SIZE > 0)
                      ):
                    buf.write(encode_tbs(last_article.index-section.index-1,
                        {0b1000: 0}))

        else:
            # Spanning article: write its index relative to its section
            buf.write(encode_tbs(spanner.index - parent_section_index,
                {0b0001: 0}))

        self.bytestring = buf.getvalue()

    def book_tbs(self, data, first):
        '''Compute self.bytestring for one record of a flat book.'''
        spanner = data['spans']
        if spanner is not None:
            self.bytestring = encode_tbs(spanner.index, {0b010: 0, 0b001: 0},
                    flag_size=3)
        else:
            starts, completes, ends = (data['starts'], data['completes'],
                                        data['ends'])
            if (not completes and (
                (len(starts) == 1 and not ends) or (len(ends) == 1 and not
                    starts))):
                # Exactly one chapter boundary touches this record
                node = starts[0] if starts else ends[0]
                self.bytestring = encode_tbs(node.index, {0b010: 0}, flag_size=3)
            else:
                # Multiple nodes: write first index and a node count
                nodes = []
                for x in (starts, completes, ends):
                    nodes.extend(x)
                nodes.sort(key=lambda x:x.index)
                self.bytestring = encode_tbs(nodes[0].index, {0b010:0,
                    0b100: len(nodes)}, flag_size=3)

# }}}


class Indexer:  # {{{

    '''
    Drive index generation: build the CNCX, the list of index entries
    (book or periodical), the INDX header/record pair(s) and the TBS map
    for every text record.
    '''

    def __init__(self, serializer, number_of_text_records,
            size_of_last_text_record, masthead_offset, is_periodical,
            opts, oeb):
        self.serializer = serializer
        self.number_of_text_records = number_of_text_records
        self.text_size = (RECORD_SIZE * (self.number_of_text_records-1) +
                            size_of_last_text_record)
        self.masthead_offset = masthead_offset
        self.secondary_record_offset = None

        self.oeb = oeb
        self.log = oeb.log
        self.opts = opts

        self.is_periodical = is_periodical
        if self.is_periodical and self.masthead_offset is None:
            raise ValueError('Periodicals must have a masthead')

        self.log('Generating MOBI index for a %s'%('periodical' if
            self.is_periodical else 'book'))
        self.is_flat_periodical = False
        if self.is_periodical:
            periodical_node = next(iter(oeb.toc))
            sections = tuple(periodical_node)
            # Flat periodical: a single section
            self.is_flat_periodical = len(sections) == 1

        self.records = []

        if self.is_periodical:
            # Ensure all articles have an author and description before
            # creating the CNCX
            for node in oeb.toc.iterdescendants():
                if node.klass == 'article':
                    aut, desc = node.author, node.description
                    if not aut:
                        aut = _('Unknown')
                    if not desc:
                        desc = _('No details available')
                    node.author, node.description = aut, desc

        self.cncx = CNCX(oeb.toc, self.is_periodical)

        if self.is_periodical:
            self.indices = self.create_periodical_index()
        else:
            self.indices = self.create_book_index()

        if not self.indices:
            raise ValueError('No valid entries in TOC, cannot generate index')

        # Primary index: header record first, then the index record,
        # then the CNCX string records
        self.records.append(self.create_index_record())
        self.records.insert(0, self.create_header())
        self.records.extend(self.cncx.records)

        if is_periodical:
            # Secondary index (author/caption/credit/... kinds)
            self.secondary_record_offset = len(self.records)
            self.records.append(self.create_header(secondary=True))
            self.records.append(self.create_index_record(secondary=True))

        self.calculate_trailing_byte_sequences()

    def create_index_record(self, secondary=False):  # {{{
        '''Build an INDX index record: entries, then an IDXT offset table.'''
        header_length = 192
        buf = io.BytesIO()
        indices = list(SecondaryIndexEntry.entries()) if secondary else self.indices

        # Write index entries
        offsets = []
        for i in indices:
            offsets.append(buf.tell())
            buf.write(i.bytestring)

        index_block = align_block(buf.getvalue())

        # Write offsets to index entries as an IDXT block
        idxt_block = b'IDXT'
        buf.seek(0), buf.truncate(0)  # reuse the buffer
        for offset in offsets:
            buf.write(pack(b'>H', header_length+offset))
        idxt_block = align_block(idxt_block + buf.getvalue())
        body = index_block + idxt_block

        header = b'INDX'
        buf.seek(0), buf.truncate(0)
        buf.write(pack(b'>I', header_length))
        buf.write(b'\0'*4)  # Unknown
        buf.write(pack(b'>I', 1))  # Header type? Or index record number?
        buf.write(b'\0'*4)  # Unknown
        # IDXT block offset
        buf.write(pack(b'>I', header_length + len(index_block)))
        # Number of index entries
        buf.write(pack(b'>I', len(offsets)))
        # Unknown
        buf.write(b'\xff'*8)
        # Unknown
        buf.write(b'\0'*156)

        header += buf.getvalue()

        ans = header + body
        # IDXT offsets are 16-bit, so the record cannot exceed 64KB
        if len(ans) > 0x10000:
            raise ValueError('Too many entries (%d) in the TOC'%len(offsets))
        return ans
    # }}}

    def create_header(self, secondary=False):  # {{{
        '''Build the INDX header record (with TAGX block and last-entry info).'''
        buf = io.BytesIO()
        if secondary:
            tagx_block = TAGX().secondary
        else:
            tagx_block = (TAGX().periodical if self.is_periodical else
                                TAGX().flat_book)
        header_length = 192

        # Ident 0 - 4
        buf.write(b'INDX')

        # Header length 4 - 8
        buf.write(pack(b'>I', header_length))

        # Unknown 8-16
        buf.write(b'\0'*8)

        # Index type: 0 - normal, 2 - inflection 16 - 20
        buf.write(pack(b'>I', 2))

        # IDXT offset 20-24
        buf.write(pack(b'>I', 0))  # Filled in later

        # Number of index records 24-28
        buf.write(pack(b'>I', 1 if secondary else len(self.records)))

        # Index Encoding 28-32
        buf.write(pack(b'>I', 65001))  # utf-8

        # Unknown 32-36
        buf.write(b'\xff'*4)

        # Number of index entries 36-40
        indices = list(SecondaryIndexEntry.entries()) if secondary else self.indices
        buf.write(pack(b'>I', len(indices)))

        # ORDT offset 40-44
        buf.write(pack(b'>I', 0))

        # LIGT offset 44-48
        buf.write(pack(b'>I', 0))

        # Number of LIGT entries 48-52
        buf.write(pack(b'>I', 0))

        # Number of CNCX records 52-56
        buf.write(pack(b'>I', 0 if secondary else len(self.cncx.records)))

        # Unknown 56-180
        buf.write(b'\0'*124)

        # TAGX offset 180-184
        buf.write(pack(b'>I', header_length))

        # Unknown 184-192
        buf.write(b'\0'*8)

        # TAGX block
        buf.write(tagx_block)

        num = len(indices)

        # The index of the last entry in the NCX
        idx = indices[-1].index
        if isinstance(idx, numbers.Integral):
            idx = encode_number_as_hex(idx)
        else:
            # String identifier: length-prefixed ASCII
            idx = idx.encode('ascii')
            idx = (bytes(bytearray([len(idx)]))) + idx
        buf.write(idx)

        # The number of entries in the NCX
        buf.write(pack(b'>H', num))

        # Padding
        pad = (4 - (buf.tell()%4))%4
        if pad:
            buf.write(b'\0'*pad)

        idxt_offset = buf.tell()

        buf.write(b'IDXT')
        # Offset of the last-entry blob (right after the TAGX block)
        buf.write(pack(b'>H', header_length + len(tagx_block)))
        buf.write(b'\0')
        # Backpatch the IDXT offset at bytes 20-24
        buf.seek(20)
        buf.write(pack(b'>I', idxt_offset))

        return align_block(buf.getvalue())
    # }}}

    def create_book_index(self):  # {{{
        '''Build the flat list of IndexEntry objects for a book TOC.'''
        indices = []
        seen = set()
        id_offsets = self.serializer.id_offsets

        # Flatten toc so that chapter to chapter jumps work with all sub
        # chapter levels as well
        for node in self.oeb.toc.iterdescendants():
            try:
                offset = id_offsets[node.href]
                label = self.cncx[node.title]
            except:
                self.log.warn('TOC item %s [%s] not found in document'%(
                    node.title, node.href))
                continue

            if offset in seen:
                continue
            seen.add(offset)

            indices.append(IndexEntry(offset, label))

        indices.sort(key=lambda x:x.offset)

        # Set lengths
        for i, index in enumerate(indices):
            try:
                next_offset = indices[i+1].offset
            except:
                next_offset = self.serializer.body_end_offset
            index.length = next_offset - index.offset

        # Remove empty indices
        indices = [x for x in indices if x.length > 0]

        # Reset lengths in case any were removed
        for i, index in enumerate(indices):
            try:
                next_offset = indices[i+1].offset
            except:
                next_offset = self.serializer.body_end_offset
            index.length = next_offset - index.offset

        # Set index values
        for index, x in enumerate(indices):
            x.index = index

        return indices

    # }}}

    def create_periodical_index(self):  # {{{
        '''
        Build the index entry list for a periodical: one root entry, then
        sections, then articles, with parent/child links, lengths and
        layout sanity checks.
        '''
        periodical_node = next(iter(self.oeb.toc))
        periodical_node_offset = self.serializer.body_start_offset
        periodical_node_size = (self.serializer.body_end_offset -
                periodical_node_offset)

        normalized_sections = []

        id_offsets = self.serializer.id_offsets

        periodical = PeriodicalIndexEntry(periodical_node_offset,
                self.cncx[periodical_node.title],
                self.cncx[periodical_node.klass], 0)
        periodical.length = periodical_node_size
        periodical.first_child_index = 1
        periodical.image_index = self.masthead_offset

        seen_sec_offsets = set()
        seen_art_offsets = set()

        for sec in periodical_node:
            normalized_articles = []
            try:
                offset = id_offsets[sec.href]
                label = self.cncx[sec.title]
                klass = self.cncx[sec.klass]
            except:
                # Section not present in the serialized document; skip it
                continue
            if offset in seen_sec_offsets:
                continue

            seen_sec_offsets.add(offset)
            section = PeriodicalIndexEntry(offset, label, klass, 1)
            section.parent_index = 0

            for art in sec:
                try:
                    offset = id_offsets[art.href]
                    label = self.cncx[art.title]
                    klass = self.cncx[art.klass]
                except:
                    continue
                if offset in seen_art_offsets:
                    continue
                seen_art_offsets.add(offset)
                article = PeriodicalIndexEntry(offset, label, klass, 2)
                normalized_articles.append(article)
                article.author_offset = self.cncx[art.author]
                article.desc_offset = self.cncx[art.description]
                if getattr(art, 'toc_thumbnail', None) is not None:
                    try:
                        ii = self.serializer.images[art.toc_thumbnail] - 1
                        if ii > -1:
                            article.image_index = ii
                    except KeyError:
                        pass  # Image not found in serializer

            if normalized_articles:
                normalized_articles.sort(key=lambda x:x.offset)
                normalized_sections.append((section, normalized_articles))

        normalized_sections.sort(key=lambda x:x[0].offset)

        # Set lengths
        for s, x in enumerate(normalized_sections):
            sec, normalized_articles = x
            try:
                sec.length = normalized_sections[s+1][0].offset - sec.offset
            except:
                sec.length = self.serializer.body_end_offset - sec.offset
            for i, art in enumerate(normalized_articles):
                try:
                    art.length = normalized_articles[i+1].offset - art.offset
                except:
                    art.length = sec.offset + sec.length - art.offset

        # Filter out zero-length articles and empty/zero-length sections
        for i, x in list(enumerate(normalized_sections)):
            sec, normalized_articles = x
            normalized_articles = list(filter(lambda x: x.length > 0,
                normalized_articles))
            normalized_sections[i] = (sec, normalized_articles)

        normalized_sections = list(filter(lambda x: x[0].length > 0 and x[1],
            normalized_sections))

        # Set indices: sections first (1..n), then all articles
        i = 0
        for sec, articles in normalized_sections:
            i += 1
            sec.index = i
            sec.parent_index = 0

        for sec, articles in normalized_sections:
            for art in articles:
                i += 1
                art.index = i

                art.parent_index = sec.index

        for sec, normalized_articles in normalized_sections:
            sec.first_child_index = normalized_articles[0].index
            sec.last_child_index = normalized_articles[-1].index

        # Set lengths again to close up any gaps left by filtering
        for s, x in enumerate(normalized_sections):
            sec, articles = x
            try:
                next_offset = normalized_sections[s+1][0].offset
            except:
                next_offset = self.serializer.body_end_offset
            sec.length = next_offset - sec.offset

            for a, art in enumerate(articles):
                try:
                    next_offset = articles[a+1].offset
                except:
                    next_offset = sec.next_offset
                art.length = next_offset - art.offset

        # Sanity check
        for s, x in enumerate(normalized_sections):
            sec, articles = x
            try:
                next_sec = normalized_sections[s+1][0]
            except:
                # Last section must end exactly at the body end
                if (sec.length == 0 or sec.next_offset !=
                        self.serializer.body_end_offset):
                    raise ValueError('Invalid section layout')
            else:
                if next_sec.offset != sec.next_offset or sec.length == 0:
                    raise ValueError('Invalid section layout')
            for a, art in enumerate(articles):
                try:
                    next_art = articles[a+1]
                except:
                    if (art.length == 0 or art.next_offset !=
                            sec.next_offset):
                        raise ValueError('Invalid article layout')
                else:
                    if art.length == 0 or art.next_offset != next_art.offset:
                        raise ValueError('Invalid article layout')

        # Flatten: periodical root, all sections, then all articles
        indices = [periodical]
        for sec, articles in normalized_sections:
            indices.append(sec)
            periodical.last_child_index = sec.index

        for sec, articles in normalized_sections:
            for a in articles:
                indices.append(a)

        return indices
    # }}}

    # TBS {{{
    def calculate_trailing_byte_sequences(self):
        '''Populate self.tbs_map: text record number (1-based) -> TBS.'''
        self.tbs_map = {}
        found_node = False
        sections = [i for i in self.indices if i.depth == 1]
        section_map = OrderedDict((i.index, i) for i in
                sorted(sections, key=lambda x:x.offset))

        deepest = max(i.depth for i in self.indices)

        for i in range(self.number_of_text_records):
            offset = i * RECORD_SIZE
            next_offset = offset + RECORD_SIZE
            data = {'ends':[], 'completes':[], 'starts':[],
                    'spans':None, 'offset':offset, 'record_number':i+1}

            # Classify every index node relative to this record's span
            for index in self.indices:

                if index.offset >= next_offset:
                    # Node starts after current record
                    if index.depth == deepest:
                        # Deepest nodes are offset-sorted; nothing further
                        # can intersect this record
                        break
                    else:
                        continue
                if index.next_offset <= offset:
                    # Node ends before current record
                    continue
                if index.offset >= offset:
                    # Node starts in current record
                    if index.next_offset <= next_offset:
                        # Node ends in current record
                        data['completes'].append(index)
                    else:
                        data['starts'].append(index)
                else:
                    # Node starts before current records
                    if index.next_offset <= next_offset:
                        # Node ends in current record
                        data['ends'].append(index)
                    elif index.depth == deepest:
                        data['spans'] = index

            if (data['ends'] or data['completes'] or data['starts'] or
                    data['spans'] is not None):
                self.tbs_map[i+1] = TBS(data, self.is_periodical, first=not
                        found_node, section_map=section_map)
                found_node = True
            else:
                self.tbs_map[i+1] = TBS({}, self.is_periodical, first=False,
                        after_first=found_node, section_map=section_map)

    def get_trailing_byte_sequence(self, num):
        return self.tbs_map[num].bytestring
    # }}}

# }}}