# -*- coding: utf-8 -*-


'''
Write content to ereader pdb file.
'''

__license__   = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'

import io
import re
import struct
import zlib

from PIL import Image

from calibre.ebooks.pdb.formatwriter import FormatWriter
from calibre.ebooks.pdb.header import PdbHeaderBuilder
from calibre.ebooks.pml.pmlml import PMLMLizer
from polyglot.builtins import as_bytes

IDENTITY = 'PNRdPPrs'

# This is an arbitrary number that is small enough to work. The actual maximum
# record size is unknown.
MAX_RECORD_SIZE = 8192


class Writer(FormatWriter):
    '''
    Serialize an OEB book as an eReader PDB file with zlib compressed PML text.
    '''

    def __init__(self, opts, log):
        '''
        :param opts: Conversion options handed through to the PML generator.
        :param log: Logger used for the PML conversion and for image errors.
        '''
        self.opts = opts
        self.log = log

    def write_content(self, oeb_book, out_stream, metadata=None):
        '''
        Convert oeb_book to PML and write it to out_stream as an eReader PDB.

        :param oeb_book: Book to convert. Its manifest supplies the images.
        :param out_stream: Writable binary stream that receives the PDB data.
        :param metadata: Optional metadata object, see ``_metadata``.
        '''
        pmlmlizer = PMLMLizer(self.log)
        pml = str(pmlmlizer.extract_content(oeb_book, self.opts)).encode('cp1252', 'replace')

        text, text_sizes = self._text(pml)
        chapter_index = self._index_item(br'(?s)\\C(?P<val>[0-4])="(?P<text>.+?)"', pml)
        chapter_index += self._index_item(br'(?s)\\X(?P<val>[0-4])(?P<text>.+?)\\X[0-4]', pml)
        chapter_index += self._index_item(br'(?s)\\x(?P<text>.+?)\\x', pml)
        link_index = self._index_item(br'(?s)\\Q="(?P<text>.+?)"', pml)
        images = self._images(oeb_book.manifest, pmlmlizer.image_hrefs)
        metadata = [self._metadata(metadata)]
        hr = [self._header_record(len(text), len(chapter_index), len(link_index), len(images))]

        # Record order as generated by Dropbook.
        #     1. eReader Header
        #     2. Compressed text
        #     3. Small font page index
        #     4. Large font page index
        #     5. Chapter index
        #     6. Links index
        #     7. Images
        #     8. (Extrapolation: there should be one more record type here,
        #        but it has not been identified yet.)
        #     9. Metadata
        #     10. Sidebar records
        #     11. Footnote records
        #     12. Text block size record
        #     13. "MeTaInFo\x00" word record

        # Every image is a single record made of its header followed by the
        # PNG data, so flatten the (header, data) pairs into plain bytes.
        image_records = [header + data for header, data in images]
        sections = (hr + text + chapter_index + link_index + image_records
                    + metadata + [text_sizes] + [b'MeTaInFo\x00'])

        lengths = [len(section) for section in sections]

        # The PDB name is the book title, the first NUL terminated field of
        # the metadata record.
        pdbHeaderBuilder = PdbHeaderBuilder(IDENTITY, metadata[0].partition(b'\x00')[0])
        pdbHeaderBuilder.build_header(lengths, out_stream)

        for section in sections:
            out_stream.write(section)

    def _text(self, pml):
        '''
        Split pml into zlib compressed records of at most MAX_RECORD_SIZE
        uncompressed bytes.

        A record is ended just before the last space that lies within reach
        when possible, otherwise it is cut at exactly MAX_RECORD_SIZE bytes.
        The remainder of the text is emitted as one record as soon as it fits.

        :param pml: The PML markup as bytes.
        :return: ``(pages, sizes)`` where pages is the list of compressed
            records and sizes is the concatenation of the big-endian unsigned
            16 bit uncompressed length of every page.
        '''
        pml_pages = []
        text_sizes = []
        index = 0
        total = len(pml)
        while index < total:
            remaining = total - index
            if remaining <= MAX_RECORD_SIZE:
                split = remaining
            else:
                # The search window is relative to index. Starting one byte in
                # guarantees progress when the record itself starts with a
                # space; the upper bound lets a record be exactly
                # MAX_RECORD_SIZE bytes long with the space opening the next.
                space = pml.rfind(b' ', index + 1, index + MAX_RECORD_SIZE + 1)
                split = MAX_RECORD_SIZE if space == -1 else space - index
            pml_pages.append(zlib.compress(pml[index:index + split]))
            text_sizes.append(struct.pack('>H', split))
            index += split

        return pml_pages, b''.join(text_sizes)

    def _index_item(self, regex, pml):
        '''
        Build the index records for every match of regex in pml.

        :param regex: Bytes pattern with a ``text`` named group and optionally
            a ``val`` named group holding the heading level as a digit.
        :param pml: The PML markup as bytes.
        :return: List of records. Each one is the big-endian unsigned 32 bit
            offset of the match in pml, followed by the tag-free text
            (indented four spaces per level when ``val`` is present) and a
            terminating NUL byte.
        '''
        index = []
        for mo in re.finditer(regex, pml):
            groups = mo.groupdict()
            if 'text' not in groups:
                continue
            text = groups['text']
            # Strip all PML tags from text. The replacement must be bytes
            # because both the pattern and the subject are bytes.
            text = re.sub(br'\\U[0-9a-z]{4}', b'', text)
            text = re.sub(br'\\a\d{3}', b'', text)
            text = re.sub(br'\\.', b'', text)
            # Add appropriate spacing to denote the various levels of headings
            if 'val' in groups:
                text = b' ' * 4 * int(groups['val']) + text
            index.append(struct.pack('>L', mo.start()) + text + b'\x00')
        return index

    def _images(self, manifest, image_hrefs):
        r'''
        Build the image records for the raster images referenced by the PML.

        Image format.

        0-4   : 'PNG '. There must be a space after PNG.
        4-36  : Image name. Must be exactly 32 bytes long. Pad with \x00 for names shorter than 32 bytes
        36-58 : Unknown.
        58-60 : Width.
        60-62 : Height.
        62-...: Raw image data in 8 bit PNG format.

        :param manifest: Iterable of manifest items exposing ``media_type``,
            ``href`` and ``data``.
        :param image_hrefs: Mapping of manifest href to the image name that is
            stored in the record.
        :return: List of ``(header, data)`` byte tuples. Images that fail to
            convert are logged and skipped, images that are too large for a
            record are skipped.
        '''
        images = []
        from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES

        for item in manifest:
            if item.media_type not in OEB_RASTER_IMAGES or item.href not in image_hrefs:
                continue
            try:
                im = Image.open(io.BytesIO(item.data)).convert('P')
                # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the
                # same filter under its current name.
                im.thumbnail((300, 300), Image.LANCZOS)

                data = io.BytesIO()
                im.save(data, 'PNG')
                data = data.getvalue()
                href = as_bytes(image_hrefs[item.href])

                header = b'PNG '
                header += href.ljust(32, b'\x00')[:32]
                # Bytes 36-58 are unknown and left zeroed.
                header = header.ljust(58, b'\x00')
                header += struct.pack('>HH', im.size[0], im.size[1])

                if len(data) + len(header) < 65505:
                    images.append((header, data))
            except Exception as e:
                # Best effort: a broken image must not abort the conversion.
                self.log.error('Error: Could not include file %s because '
                    '%s.' % (item.href, e))

        return images

    def _metadata(self, metadata):
        r'''
        Build the metadata record.

        Metadata takes the form:
        title\x00
        author\x00
        copyright\x00
        publisher\x00
        isbn\x00

        :param metadata: Object exposing ``title``, ``creator``, ``rights``
            and ``publisher`` sequences whose entries have a ``value``
            attribute, or a false value to use the defaults.
        :return: The record as bytes. The isbn field is always empty.
        '''

        title = _('Unknown')
        author = _('Unknown')
        copyright = ''
        publisher = ''
        isbn = ''

        if metadata:
            if len(metadata.title) >= 1:
                title = metadata.title[0].value
            if len(metadata.creator) >= 1:
                from calibre.ebooks.metadata import authors_to_string
                author = authors_to_string([x.value for x in metadata.creator])
            if len(metadata.rights) >= 1:
                copyright = metadata.rights[0].value
            if len(metadata.publisher) >= 1:
                publisher = metadata.publisher[0].value

        return as_bytes('%s\x00%s\x00%s\x00%s\x00%s\x00' % (title, author, copyright, publisher, isbn))

    def _header_record(self, text_count, chapter_count, link_count, image_count):
        '''
        Build the 132 byte eReader header record.

        text_count = the number of text pages
        chapter_count = the number of chapter index records
        link_count = the number of link index records
        image_count = the number of images

        :return: The header record as bytes.
        '''
        compression = 10  # zlib compression.
        non_text_offset = text_count + 1

        chapter_offset = non_text_offset
        link_offset = chapter_offset + chapter_count

        if image_count > 0:
            image_data_offset = link_offset + link_count
            meta_data_offset = image_data_offset + image_count
            last_data_offset = meta_data_offset + 1
        else:
            meta_data_offset = link_offset + link_count
            last_data_offset = meta_data_offset + 1
            image_data_offset = last_data_offset

        # Record types that are absent point at the last data offset.
        if chapter_count == 0:
            chapter_offset = last_data_offset
        if link_count == 0:
            link_offset = last_data_offset

        fields = [
            compression,        # [0:2]   # Compression. Specifies compression and drm. 2 = palmdoc, 10 = zlib. 260 and 272 = DRM
            0,                  # [2:4]   # Unknown.
            0,                  # [4:6]   # Unknown.
            25152,              # [6:8]   # 25152 is MAGIC. Somehow represents the cp1252 encoding of the text
            0,                  # [8:10]  # Number of small font pages. 0 if page index is not built.
            0,                  # [10:12] # Number of large font pages. 0 if page index is not built.
            non_text_offset,    # [12:14] # Non-Text record start.
            chapter_count,      # [14:16] # Number of chapter index records.
            0,                  # [16:18] # Number of small font page index records.
            0,                  # [18:20] # Number of large font page index records.
            image_count,        # [20:22] # Number of images.
            link_count,         # [22:24] # Number of links.
            1,                  # [24:26] # 1 if has metadata, 0 if not.
            0,                  # [26:28] # Unknown.
            0,                  # [28:30] # Number of Footnotes.
            0,                  # [30:32] # Number of Sidebars.
            chapter_offset,     # [32:34] # Chapter index offset.
            2560,               # [34:36] # 2560 is MAGIC.
            last_data_offset,   # [36:38] # Small font page offset. This will be the last data offset if there are none.
            last_data_offset,   # [38:40] # Large font page offset. This will be the last data offset if there are none.
            image_data_offset,  # [40:42] # Image offset. This will be the last data offset if there are none.
            link_offset,        # [42:44] # Links offset. This will be the last data offset if there are none.
            meta_data_offset,   # [44:46] # Metadata offset. This will be the last data offset if there are none.
            0,                  # [46:48] # Unknown.
            last_data_offset,   # [48:50] # Footnote offset. This will be the last data offset if there are none.
            last_data_offset,   # [50:52] # Sidebar offset. This will be the last data offset if there are none.
            last_data_offset,   # [52:54] # Last data offset.
        ]
        record = struct.pack('>%dH' % len(fields), *fields)

        # [54:132] is zero filled.
        return record.ljust(132, b'\x00')