1 #include "../include/pdbfmt.h"
2 #include "../include/crlog.h"
3 #include <ctype.h>
4
5 // uncomment following line to save PDB content streams to /tmp
6 //#define DUMP_PDB_CONTENTS
7
8 struct PDBHdr
9 {
10 lUInt8 name[32];
11 lUInt16 attributes;
12 lUInt16 version;
13 lUInt32 creationDate;
14 lUInt32 modificationDate;
15 lUInt32 lastBackupDate;
16 lUInt32 modificationNumber;
17 lUInt32 appInfoID;
18 lUInt32 sortInfoID;
19 lUInt8 type[4];
20 lUInt8 creator[4];
21 lUInt32 uniqueIDSeed;
22 lUInt32 nextRecordList;
23 lUInt16 recordCount;
24 lUInt16 firstEntry;
readPDBHdr25 bool read( LVStreamRef stream ) {
26 // TODO: byte order support
27 lvsize_t bytesRead = 0;
28 if ( stream->Read(this, sizeof(PDBHdr), &bytesRead )!=LVERR_OK )
29 return false;
30 if ( bytesRead!=sizeof(PDBHdr) )
31 return false;
32 lvByteOrderConv cnv;
33 if ( cnv.lsf() )
34 {
35 cnv.rev(&attributes);
36 cnv.rev(&version);
37 cnv.rev(&creationDate);
38 cnv.rev(&modificationDate);
39 cnv.rev(&lastBackupDate);
40 cnv.rev(&modificationNumber);
41 cnv.rev(&appInfoID);
42 cnv.rev(&sortInfoID);
43 cnv.rev(&uniqueIDSeed);
44 cnv.rev(&nextRecordList);
45 cnv.rev(&recordCount);
46 cnv.rev(&firstEntry);
47 }
48 return true;
49 }
checkTypePDBHdr50 bool checkType( const char * str ) {
51 return type[0]==str[0] && type[1]==str[1] && type[2]==str[2] && type[3]==str[3];
52 }
53
checkCreatorPDBHdr54 bool checkCreator( const char * str ) {
55 return creator[0]==str[0] && creator[1]==str[1] && creator[2]==str[2] && creator[3]==str[3];
56 }
57 };
58
59 struct PDBRecordEntry
60 {
61 lUInt32 localChunkId;
62 lUInt8 attributes[4];
63 //lUInt8 uniqueID[3];
readPDBRecordEntry64 bool read( LVStreamRef stream ) {
65 // TODO: byte order support
66 lvsize_t bytesRead = 0;
67 if ( stream->Read(this, sizeof(PDBRecordEntry), &bytesRead )!=LVERR_OK )
68 return false;
69 if ( bytesRead!=sizeof(PDBRecordEntry) )
70 return false;
71 lvByteOrderConv cnv;
72 if ( cnv.lsf() )
73 {
74 cnv.rev(&localChunkId);
75 }
76 return true;
77 }
78 };
79
80 struct PalmDocPreamble
81 {
82 lUInt16 compression; // 2 Compression 1 == no compression, 2 = PalmDOC compression (see below)
83 lUInt16 unused; // 2 Unused Always zero
84 lUInt32 textLength; // 4 text length Uncompressed length of the entire text of the book
85 lUInt16 recordCount; // 2 record count Number of PDB records used for the text of the book.
86 lUInt16 recordSize; // 2 record size Maximum size of each record containing text, always 4096
readPalmDocPreamble87 bool read( LVStreamRef stream ) {
88 // TODO: byte order support
89 lvsize_t bytesRead = 0;
90 if ( stream->Read(this, sizeof(PalmDocPreamble), &bytesRead )!=LVERR_OK )
91 return false;
92 if ( bytesRead!=sizeof(PalmDocPreamble) )
93 return false;
94 lvByteOrderConv cnv;
95 if ( cnv.lsf() )
96 {
97 cnv.rev(&compression); // 2 Compression 1 == no compression, 2 = PalmDOC compression (see below)
98 cnv.rev(&textLength); // 4 text length Uncompressed length of the entire text of the book
99 cnv.rev(&recordCount); // 2 record count Number of PDB records used for the text of the book.
100 cnv.rev(&recordSize); // 2 record size Maximum size of each record containing text, always 4096
101 }
102 if ( compression!=1 && compression!=2 )
103 return false;
104 return true;
105 }
106 };
107
108 struct MobiPreamble : public PalmDocPreamble
109 {
110 lUInt16 mobiEncryption; // 2 Encryption Type 0 == no encryption, 1 = Old Mobipocket Encryption, 2 = Mobipocket Encryption
111 lUInt16 unused2; // 2 unknown, usually 0
112
113 lUInt8 mobiSignature[4]; // 16 4 identifier the characters M O B I
114 lUInt32 hederLength; // 20 4 header length the length of the MOBI header, including the previous 4 bytes
115 lUInt32 mobiType; // 24 4 Mobi type The kind of Mobipocket file this is
116 // 2 Mobipocket Book
117 // 3 PalmDoc Book
118 // 4 Audio
119 // 257 News
120 // 258 News_Feed
121 // 259 News_Magazine
122 // 513 PICS
123 // 514 WORD
124 // 515 XLS
125 // 516 PPT
126 // 517 TEXT
127 // 518 HTML
128 lUInt32 encoding; // 28 4 text Encoding 1252 = CP1252 (WinLatin1); 65001 = UTF-8
129 lUInt32 uid; // 32 4 Unique-ID Some kind of unique ID number (random?)
130 lUInt32 fileVersion; // 36 4 File version Version of the Mobipocket format used in this file.
131 lUInt32 reserved[10]; // 40 40 Reserved all 0xFF. In case of a dictionary, or some newer file formats, a few bytes are used from this range of 40 0xFFs
132 lUInt32 firstNonBookIndex; // 80 4 First Non-book index? First record number (starting with 0) that's not the book's text
133 lUInt32 fullNameOffset; // 84 4 Full Name Offset Offset in record 0 (not from start of file) of the full name of the book
134 lUInt32 fullNameLength; // 88 4 Full Name Length Length in bytes of the full name of the book
135 lUInt32 locale; // 92 4 Locale Book locale code. Low byte is main language 09= English, next byte is dialect, 08 = British, 04 = US. Thus US English is 1033, UK English is 2057.
136 lUInt32 inputLanguage; // 96 4 Input Language Input language for a dictionary
137 lUInt32 outputLanguage; // 100 4 Output Language Output language for a dictionary
138 lUInt32 minVersion; // 104 4 Min version Minimum mobipocket version support needed to read this file.
139 lUInt32 firstImageIndex; // 108 4 First Image index? First record number (starting with 0) that contains an image. Image records should be sequential.
140 lUInt32 huffmanRecordOffset; // 112 4 Huffman Record Offset The record number of the first huffman compression record.
141 lUInt32 huffmanRecordCount; // 116 4 Huffman Record Count The number of huffman compression records.
142 lUInt32 reserved2[2]; // 120 8 ? eight bytes, often zeros
143 lUInt32 mobiFlags; // 128 4 EXTH flags bitfield. if bit 6 (0x40) is set, then there's an EXTH record
144 lUInt32 unknown3[8]; // 132 32 ? 32 unknown bytes, if MOBI is long enough
145 lUInt32 drmOffset; // 164 4 DRM Offset Offset to DRM key info in DRMed files. 0xFFFFFFFF if no DRM
146 lUInt32 drmCount; // 168 4 DRM Count Number of entries in DRM info. 0xFFFFFFFF if no DRM
147 lUInt32 drmSize; // 172 4 DRM Size Number of bytes in DRM info.
148 lUInt32 drmFlags; // 176 4 DRM Flags Some flags concerning the DRM info.
149
150
readMobiPreamble151 bool read( LVStreamRef stream, lUInt16 & extraDataFlags ) {
152 extraDataFlags = 0;
153 lvsize_t bytesRead = 0;
154 if ( stream->Read(this, sizeof(MobiPreamble), &bytesRead )!=LVERR_OK )
155 return false;
156 if ( bytesRead!=sizeof(MobiPreamble) )
157 return false;
158 lvByteOrderConv cnv;
159 if ( cnv.lsf() )
160 {
161 cnv.rev(&compression); // 2 Compression 1 == no compression, 2 = PalmDOC compression (see below)
162 cnv.rev(&textLength); // 4 text length Uncompressed length of the entire text of the book
163 cnv.rev(&recordCount); // 2 record count Number of PDB records used for the text of the book.
164 cnv.rev(&recordSize); // 2 record size Maximum size of each record containing text, always 4096
165 cnv.rev(&mobiEncryption);// 2 Encryption Type 0 == no encryption, 1 = Old Mobipocket Encryption, 2 = Mobipocket Encryption
166 cnv.rev(&hederLength); // 20 4 header length the length of the MOBI header, including the previous 4 bytes
167 cnv.rev(&mobiType); // 24 4 Mobi type The kind of Mobipocket file this is
168 cnv.rev(&encoding); // 28 4 text Encoding 1252 = CP1252 (WinLatin1); 65001 = UTF-8
169 cnv.rev(&uid); // 32 4 Unique-ID Some kind of unique ID number (random?)
170 cnv.rev(&fileVersion); // 36 4 File version Version of the Mobipocket format used in this file.
171 cnv.rev(&firstNonBookIndex); // 80 4 First Non-book index? First record number (starting with 0) that's not the book's text
172 cnv.rev(&fullNameOffset); // 84 4 Full Name Offset Offset in record 0 (not from start of file) of the full name of the book
173 cnv.rev(&fullNameLength); // 88 4 Full Name Length Length in bytes of the full name of the book
174 cnv.rev(&locale); // 92 4 Locale Book locale code. Low byte is main language 09= English, next byte is dialect, 08 = British, 04 = US. Thus US English is 1033, UK English is 2057.
175 cnv.rev(&inputLanguage); // 96 4 Input Language Input language for a dictionary
176 cnv.rev(&outputLanguage); // 100 4 Output Language Output language for a dictionary
177 cnv.rev(&minVersion); // 104 4 Min version Minimum mobipocket version support needed to read this file.
178 cnv.rev(&firstImageIndex); // 108 4 First Image index? First record number (starting with 0) that contains an image. Image records should be sequential.
179 cnv.rev(&huffmanRecordOffset); // 112 4 Huffman Record Offset The record number of the first huffman compression record.
180 cnv.rev(&huffmanRecordCount); // 116 4 Huffman Record Count The number of huffman compression records.
181 cnv.rev(&mobiFlags); // 128 4 EXTH flags bitfield. if bit 6 (0x40) is set, then there's an EXTH record
182 cnv.rev(&drmOffset); // 164 4 DRM Offset Offset to DRM key info in DRMed files. 0xFFFFFFFF if no DRM
183 cnv.rev(&drmCount); // 168 4 DRM Count Number of entries in DRM info. 0xFFFFFFFF if no DRM
184 cnv.rev(&drmSize); // 172 4 DRM Size Number of bytes in DRM info.
185 cnv.rev(&drmFlags); // 176 4 DRM Flags Some flags concerning the DRM info.
186 }
187 if ( compression!=1 && compression!=2 )
188 return false;
189 if ( mobiType!=2 && mobiType!=3 && mobiType!=517 && mobiType!=518
190 && mobiType!=257 && mobiType!=258 && mobiType!=259 )
191 return false; // unsupported type
192 if ( mobiEncryption!=0 )
193 return false; // encryption is not supported
194 if ( hederLength >= 0xE4 ) {
195 stream->Seek(242-180, LVSEEK_CUR, NULL);
196 stream->Read(&extraDataFlags);
197 if ( cnv.lsf() )
198 cnv.rev(&extraDataFlags);
199 // if (extraDataFlags) {
200 // CRLog::trace("extraDataFlags=%04x", (int)extraDataFlags);
201 // }
202 }
203 return true;
204 }
205 };
206
207 // format description from http://wiki.mobileread.com/wiki/EReader
208 struct EReaderHeader
209 {
210 lUInt16 compression; // 0-2 compression Specifies compression and drm. 2 = palmdoc, 10 = zlib. 260 and 272 = DRM
211 lUInt16 unknown1[2]; // 2-6 unknown Value of 0 is used
212 lUInt16 encoding; // 6-8 encoding Always 25152 (0x6240). All text must be encoded as Latin-1 cp1252
213 lUInt16 smallPageCount; // 8-10 Number of small pages The number of small font pages. If page index is not build in then 0.
214 lUInt16 largePageCount; // 10-12 Number of large pages The number of large font pages. If page index is not build in then 0.
215 lUInt16 nonTextRecordStart; //12-14 Non-Text record start The location of the first non text records. record 1 to this value minus 1 are all text records
216 lUInt16 numberOfChapters;// 14-16 Number of chapters The number of chapter index records contained in the file
217 lUInt16 smallPageRecordCount; // 16-18 Number of small index The number of small font page index records contained in the file
218 lUInt16 largePageRecordCount; // 18-20 Number of large index The number of large font page index records contained in the file
219 lUInt16 imageCount; // 20-22 Number of images The number of images contained in the file
220 lUInt16 linkCount; // 22-24 Number of links The number of links contained in the file
221 lUInt16 metadataAvailable; // 24-26 Metadata avaliable Is there a metadata record in the file? 0 = None, 1 = There is a metadata record
222 lUInt16 unknown2; // 26-28 Unknown Value of 0 is used
223 lUInt16 footnoteRecordsCount; // 28-30 Number of Footnotes The number of footnote records in the file
224 lUInt16 sidebarRecordsCount; // 30-32 Number of Sidebars The number of sidebar records in the file
225 lUInt16 chapterIndexStart; // 32-34 Chapter index record start The location of chapter index records. If there are no chapters use the value for the Last data record.
226 lUInt16 unknown3; // 34-36 2560 Magic value that must be set to 2560
227 lUInt16 smallPageIndexStart; // 36-38 Small page index start The location of small font page index records. If page table is not built in use the value for the Last data record.
228 lUInt16 largePageIndexStart; // 38-40 Large page index start The location of large font page index records. If page table is not built in use the value for the Last data record.
229 lUInt16 imageDataRecordStart; // 40-42 Image data record start The location of the first image record. If there are no images use the value for the Last data record.
230 lUInt16 linksRecordStart; // 42-44 Links record start The location of the first link index record. If there are no links use the value for the Last data record.
231 lUInt16 metadataRecordStart; // 44-46 Metadata record start The location of the metadata record. If there is no metadata use the value for the Last data record.
232 lUInt16 unknown4; // 46-48 Unknown Value of 0 is used
233 lUInt16 footnoteRecordStart; // 48-50 Footnote record start The location of the first footnote record. If there are no footnotes use the value for the Last data record.
234 lUInt16 sidebarRecordStart; // 50-52 Sidebar record start The location of the first sidebar record. If there are no sidebars use the value for the Last data record.
235 lUInt16 lastDataRecord; // 52-54 Last data record The location of the last data record
236 lUInt16 unknown5[39]; // 54-132 Unknown Value of 0 is used
readEReaderHeader237 bool read( LVStreamRef stream ) {
238 lvsize_t bytesRead = 0;
239 if ( stream->Read(this, sizeof(EReaderHeader), &bytesRead )!=LVERR_OK )
240 return false;
241 if ( bytesRead!=sizeof(EReaderHeader) )
242 return false;
243 lvByteOrderConv cnv;
244 if ( cnv.lsf() )
245 {
246 cnv.rev(&compression); // 0-2 compression Specifies compression and drm. 2 = palmdoc, 10 = zlib. 260 and 272 = DRM
247 cnv.rev(&encoding); // 6-8 encoding Always 25152 (0x6240). All text must be encoded as Latin-1 cp1252
248 cnv.rev(&smallPageCount); // 8-10 Number of small pages The number of small font pages. If page index is not build in then 0.
249 cnv.rev(&largePageCount); // 10-12 Number of large pages The number of large font pages. If page index is not build in then 0.
250 cnv.rev(&nonTextRecordStart); //12-14 Non-Text record start The location of the first non text records. record 1 to this value minus 1 are all text records
251 cnv.rev(&numberOfChapters);// 14-16 Number of chapters The number of chapter index records contained in the file
252 cnv.rev(&smallPageRecordCount); // 16-18 Number of small index The number of small font page index records contained in the file
253 cnv.rev(&largePageRecordCount); // 18-20 Number of large index The number of large font page index records contained in the file
254 cnv.rev(&imageCount); // 20-22 Number of images The number of images contained in the file
255 cnv.rev(&linkCount); // 22-24 Number of links The number of links contained in the file
256 cnv.rev(&metadataAvailable); // 24-26 Metadata avaliable Is there a metadata record in the file? 0 = None, 1 = There is a metadata record
257 cnv.rev(&footnoteRecordsCount); // 28-30 Number of Footnotes The number of footnote records in the file
258 cnv.rev(&sidebarRecordsCount); // 30-32 Number of Sidebars The number of sidebar records in the file
259 cnv.rev(&chapterIndexStart); // 32-34 Chapter index record start The location of chapter index records. If there are no chapters use the value for the Last data record.
260 cnv.rev(&smallPageIndexStart); // 36-38 Small page index start The location of small font page index records. If page table is not built in use the value for the Last data record.
261 cnv.rev(&largePageIndexStart); // 38-40 Large page index start The location of large font page index records. If page table is not built in use the value for the Last data record.
262 cnv.rev(&imageDataRecordStart); // 40-42 Image data record start The location of the first image record. If there are no images use the value for the Last data record.
263 cnv.rev(&linksRecordStart); // 42-44 Links record start The location of the first link index record. If there are no links use the value for the Last data record.
264 cnv.rev(&metadataRecordStart); // 44-46 Metadata record start The location of the metadata record. If there is no metadata use the value for the Last data record.
265 cnv.rev(&footnoteRecordStart); // 48-50 Footnote record start The location of the first footnote record. If there are no footnotes use the value for the Last data record.
266 cnv.rev(&sidebarRecordStart); // 50-52 Sidebar record start The location of the first sidebar record. If there are no sidebars use the value for the Last data record.
267 cnv.rev(&lastDataRecord); // 52-54 Last data record The location of the last data record
268 }
269 if ( compression!=1 && compression!=2 && compression!=10 )
270 return false;
271 return true;
272 }
273 };
274
/// Fixed-size leading fields of a Plucker document header.
/// Only the three numeric fields and the first length word are declared here;
/// the variable-length items that follow them in the file are described in the
/// trailing comments and are not represented as members.
struct PluckerPreamble {
    lUInt32 signature; // 4 Numeric Must contain the value 0x6C6E6368.
    lUInt16 hdrVersion; // 2 Numeric Must have the value 3.
    lUInt16 hdrEncoding; // 2 Numeric Must have the value 0.
    lUInt16 verStrWords; // 2 Numeric The number of two-byte words following, containing the version string.
    // char 2 * verStrWords String NUL-terminated ISO Latin-1 string, padded at end if necessary with a zero byte to an even-byte boundary, containing a version string to display to the user containing version information for the document.
    // pqaTitleWords 2 Numeric The number of two-byte words in the following pqaTitleStr.
    // pqaTitleStr 2 * pqaTitleWords String NUL-terminated ISO Latin-1 string, padded at end if necessary with a zero byte to an even-byte boundary, containing a title string for iconic display of the document.
    // iconWords 2 Numeric Number of two-byte words in the following icon image.
    // icon 2 * iconWords Image Image (32x32) in Palm image format to be used as an icon to represent the document on a desktop-style display. The image may not use a custom color map.
    // smIconWords 2 Numeric Number of two-byte words in the following icon image.
    // smIcon 2 * smIconWords Image Small image (15x9) in Palm image format to be used as an icon to represent the document on a desktop-style display. The image may not use a custom color map.
};
288
/// Unpacks `compsize` bytes at `compbuf` into a newly provided buffer.
/// Defined elsewhere; on success `dstbuf`/`dstsize` receive the unpacked data.
/// Callers in this file release `dstbuf` with free() after copying it.
/// @return false when unpacking fails.
bool ldomUnpack( const lUInt8 * compbuf, int compsize, lUInt8 * &dstbuf, lUInt32 & dstsize );
291
292 class PDBFile;
293
294 class LVPDBContainerItem : public LVContainerItemInfo {
295 protected:
296 LVStreamRef _stream;
297 PDBFile * _file;
298 int _startBlock;
299 int _size;
300 lString32 _name;
301 public:
302 /// returns object size (file size or directory entry count)
GetSize(lvsize_t * pSize)303 virtual lverror_t GetSize( lvsize_t * pSize ) {
304 *pSize = _size;
305 return LVERR_OK;
306 }
GetSize() const307 virtual lvsize_t GetSize() const { return _size; }
GetName() const308 virtual const lChar32 * GetName() const { return _name.c_str(); }
GetFlags() const309 virtual lUInt32 GetFlags() const { return 0; }
IsContainer() const310 virtual bool IsContainer() const { return false; }
openStream()311 virtual LVStreamRef openStream() {
312 // TODO: implement stream creation
313 return LVStreamRef();
314 }
LVPDBContainerItem(LVStreamRef stream,PDBFile * file,lString32 name,int startBlockIndex,int size)315 LVPDBContainerItem( LVStreamRef stream, PDBFile * file, lString32 name, int startBlockIndex, int size )
316 : _stream(stream), _file(file), _startBlock(startBlockIndex), _size(size), _name(name) {
317 }
318 };
319
320 class LVPDBRegionContainerItem : public LVPDBContainerItem {
321 public:
322 /// returns object size (file size or directory entry count)
GetFlags() const323 virtual lUInt32 GetFlags() const { return 0; }
openStream()324 virtual LVStreamRef openStream() {
325 // return region of base stream
326 return LVStreamRef( new LVStreamFragment( _stream, _startBlock, _size ) );
327 }
LVPDBRegionContainerItem(LVStreamRef stream,PDBFile * file,lString32 name,int startOffset,int size)328 LVPDBRegionContainerItem( LVStreamRef stream, PDBFile * file, lString32 name, int startOffset, int size )
329 : LVPDBContainerItem(stream, file, name, startOffset, size) {
330 }
331 };
332
333 class LVPDBContainer : public LVContainer
334 {
335 LVPtrVector<LVPDBContainerItem> _list;
336 LVStreamRef _stream;
337 public:
GetParentContainer()338 virtual LVContainer * GetParentContainer() { return NULL; }
339
addItem(LVPDBContainerItem * item)340 void addItem ( LVPDBContainerItem * item ) {
341 _list.add(item);
342 }
343
344 //virtual const LVContainerItemInfo * GetObjectInfo(const lChar32 * pname);
GetObjectInfo(int index)345 virtual const LVContainerItemInfo * GetObjectInfo(int index) {
346 if ( index>=0 && index<_list.length() )
347 return _list[index];
348 return NULL;
349 }
GetObjectCount() const350 virtual int GetObjectCount() const { return _list.length(); }
351 /// returns object size (file size or directory entry count)
GetSize(lvsize_t * pSize)352 virtual lverror_t GetSize( lvsize_t * pSize ) {
353 *pSize = _list.length();
354 return LVERR_OK;
355 }
356
OpenStream(const lChar32 * fname,lvopen_mode_t mode)357 virtual LVStreamRef OpenStream( const lChar32 * fname, lvopen_mode_t mode ) {
358 if ( mode!=LVOM_READ )
359 return LVStreamRef();
360 for ( int i=0; i<_list.length(); i++ ) {
361 //CRLog::trace("OpenStream(%s) : %s", LCSTR(lString32(fname)), LCSTR(lString32(_list[i]->GetName())) );
362 if ( !lStr_cmp(_list[i]->GetName(), fname) )
363 return _list[i]->openStream();
364 }
365 return LVStreamRef();
366 }
367
setStream(LVStreamRef stream)368 void setStream( LVStreamRef stream ) {
369 _stream = stream;
370 }
371
LVPDBContainer()372 LVPDBContainer( ) {
373 //_contentStream = LVStreamRef((LVStream*)file);
374 }
~LVPDBContainer()375 virtual ~LVPDBContainer() { }
376 };
377
pattern_cmp(const lUInt8 * buf,const char * pattern)378 static bool pattern_cmp( const lUInt8 * buf, const char * pattern ) {
379 for ( int i=0; pattern[i]; i++ )
380 if ( tolower(buf[i])!=pattern[i] )
381 return false;
382 return true;
383 }
384
385 class PDBFile : public LVNamedStream {
386 public:
/// Container flavour, chosen in open() from the PDB header's type/creator tags.
enum Format {
    UNKNOWN,    // no supported type/creator pair matched
    PALMDOC,    // type "TEXt", creator "REAd"
    EREADER,    // type "PNRd", creator "PPrs"
    PLUCKER,    // type "Data", creator "Plkr"
    MOBI        // type "BOOK", creator "MOBI"
};
394 private:
395
/// Location of one PDB record, in the file and in the unpacked text.
struct Record {
    lUInt32 offset;     // file offset of the record, taken from the record list in open()
    lUInt32 size;       // stored size: distance to the next record's offset (or to end of file)
    lUInt32 unpoffset;  // offset of the record's data in the unpacked text (used by findBlock/readBlock)
    lUInt32 unpsize;    // size of the record's data in the unpacked text (used by findBlock/readBlock)
};
402 LVArray<Record> _records;
403 LVStreamRef _stream;
404 Format _format;
405 int _compression;
406 lUInt32 _textSize;
407 int _recordCount; // text record count
408 // read buffer
409 LVArray<lUInt8> _buf;
410 int _bufIndex;
411 lvpos_t _bufOffset;
412 lvsize_t _bufSize;
413 lvpos_t _pos;
414 lUInt16 _mobiExtraDataFlags;
415 CRPropRef m_doc_props;
416 //LVPDBContainer * _container;
unpack(LVArray<lUInt8> & dst,LVArray<lUInt8> & src)417 bool unpack( LVArray<lUInt8> & dst, LVArray<lUInt8> & src ) {
418 int srclen = src.length();
419 dst.reset();
420 dst.reserve(srclen);
421
422 if ( _compression==2 ) {
423 // PalmDOC
424 int pos = 0;
425 lInt32 b;
426
427 while (pos<srclen) {
428 b = src[pos];
429 pos++;
430 if (b > 0 && b < 9) {
431 // 1..8 bytes follow
432 if (pos + b > srclen)
433 break;
434 for (int i=0; i<(int)b; i++)
435 dst.add(src[pos++]);
436 } else if (b < 128) {
437 // unmodified single byte
438 dst.add((lUInt8)b);
439 } else if (b >= 0xc0) {
440 dst.add(' ');
441 dst.add(b & 0x7f);
442 } else {
443 if (pos >= srclen)
444 break;
445 lUInt32 z = ((b & 0x3f) << 8) + src[pos];
446 pos++;
447 int offset = z >> 3;
448 int size = (z & 7) + 3;
449 int srcpos = dst.length() - offset;
450 for (int i = 0; i < size; i++) {
451 if (srcpos >= 0) {
452 dst.add(dst[srcpos++]);
453 } else {
454 dst.add('?');
455 //CRLog::trace("wrong offset");
456 }
457 }
458 }
459 }
460 } else if ( _compression==10 ) {
461 // zlib
462 /// unpack data from _compbuf to _buf
463 lUInt8 * dstbuf;
464 lUInt32 dstsize;
465 if ( !ldomUnpack( src.get(), src.size(), dstbuf, dstsize ) )
466 return false;
467 dst.add(dstbuf, dstsize);
468 free(dstbuf);
469 } else if ( _compression==17480 ) {
470 // zlib
471 // TODO: shouldn't it be HUFFMAN unpacker?
472 /// unpack data from _compbuf to _buf
473 lUInt8 * dstbuf;
474 lUInt32 dstsize;
475 if ( !ldomUnpack( src.get(), src.size(), dstbuf, dstsize ) )
476 return false;
477 dst.add(dstbuf, dstsize);
478 free(dstbuf);
479 }
480 return true;
481 }
482
removeExtraData(int index,LVArray<lUInt8> & buf)483 void removeExtraData(int index, LVArray<lUInt8> & buf) {
484 if (index >= _records.length() || !_mobiExtraDataFlags)
485 return;
486 for (int flag = 0x8000; flag; flag >>= 1) {
487 if (!(_mobiExtraDataFlags & flag))
488 continue;
489 lInt32 n = buf[buf.length()-1];
490 if (flag == 1) {
491 n &= 3;
492
493 _records[index].size -= 1;
494 buf.erase(buf.length()-1, 1);
495
496 if (n>0) {
497 //CRLog::trace("block %d: removing %d bytes of multibyte character", index, n);
498
499 for (int i=n; i>0; i--) {
500 n = buf[buf.length() - 1];
501 if (!(n & 0x80))
502 break;
503 buf.erase(buf.length() - 1, 1);
504 if ((n & 0xC0) != 0x80)
505 break;
506 }
507 }
508
509 } else {
510 if (!(n & 0x80)) {
511 lUInt32 n2 = buf[buf.length()-2];
512 n = (n & 0x7F) | ((n2 & 0x7F) << 16);
513 } else {
514 n = n & 0x7F;
515 }
516 if (n > 0 && buf.length() >= n) {
517 //CRLog::trace("block %d: removing %d bytes of extra data type %d", index, n, flag);
518 _records[index].size -= n;
519 buf.erase(buf.length()-n, n);
520 }
521 }
522 // if (n && buf.length() >= n) {
523 // _records[index].size -= n;
524 // buf.erase(buf.length()-n, n);
525
526 // if (flag == 1 && n > 1) {
527 // CRLog::trace("block %d: removing %d bytes of multibyte character", index, n - 1);
528 // // remove extra utf-8 points
529 // while (buf.length()) {
530 // n = buf[buf.length() - 1];
531 // if (!(n & 0x80))
532 // break;
533 // buf.erase(buf.length() - 1, 1);
534 // if ((n & 0xC0) != 0x80)
535 // break;
536 // }
537 // }
538 // }
539 }
540 }
541
readRecordNoUnpack(int index,LVArray<lUInt8> * dstbuf)542 bool readRecordNoUnpack(int index, LVArray<lUInt8> * dstbuf) {
543 if (index >= _records.length())
544 return false;
545 dstbuf->reset();
546 dstbuf->addSpace(_records[index].size);
547 lvsize_t bytesRead = 0;
548 _stream->SetPos(_records[index].offset);
549 if (_stream->Read(dstbuf->get(), _records[index].size, &bytesRead) != LVERR_OK)
550 return false;
551 if (bytesRead != _records[index].size)
552 return false;
553 return true;
554 }
readRecord(int index,LVArray<lUInt8> * dstbuf)555 bool readRecord( int index, LVArray<lUInt8> * dstbuf ) {
556 if (index >= _records.length())
557 return false;
558 LVArray<lUInt8> srcbuf;
559 LVArray<lUInt8> * buf = _compression ? &srcbuf : dstbuf;
560 if ( buf->empty() )
561 buf->reserve(1); // ensure buf->_array is no more NULL for _stream->Read(dstbuf->get() above
562 if (!readRecordNoUnpack(index, buf))
563 return false;
564
565 if (_mobiExtraDataFlags && index < _recordCount)
566 removeExtraData(index, *buf);
567
568 if (!_compression)
569 return true;
570 // unpack
571 return unpack(*dstbuf, srcbuf);
572 }
573
readBlock(int index)574 bool readBlock( int index ) {
575 if ( index<0 || index>=_recordCount )
576 return false;
577 if ( index==_bufIndex )
578 return true; // already read
579 bool res = readRecord( index+1, &_buf );
580 if ( !res )
581 return false;
582 _bufIndex = index;
583 _bufOffset = _records[index+1].unpoffset;
584 _bufSize = _records[index+1].unpsize;
585 return true;
586 }
587
findBlock(lvpos_t pos)588 int findBlock( lvpos_t pos ) {
589 if ( pos==_textSize )
590 return _recordCount-1;
591 for ( int i=0; i<_recordCount; i++ ) {
592 if ( pos>=_records[i+1].unpoffset && pos<_records[i+1].unpoffset+_records[i+1].unpsize )
593 return i;
594 }
595 return -1;
596 }
597
seek(lvpos_t pos)598 bool seek( lvpos_t pos ) {
599 int index = findBlock(pos);
600 if ( index<0 )
601 return false;
602 bool res = readBlock( index );
603 if ( !res )
604 return false;
605 _pos = pos;
606 return true;
607 }
608
609 public:
610
611 // LVContainerRef getContainer() {
612 // if ( !_container )
613 // _container = new LVPDBContainer();
614 // return LVContainerRef(&_container);
615 // }
616
617
618 // static PDBFile * create( LVStreamRef stream, int & format ) {
619 // format = 0;
620 // PDBFile * res = new PDBFile();
621 // if ( res->open(stream, true, format) ) {
622 // format = res->_format;
623 // return res;
624 // }
625 // delete res;
626 // return NULL;
627 // }
628
detectFormat(doc_format_t & contentFormat)629 void detectFormat( doc_format_t & contentFormat ) {
630 if ( contentFormat == doc_format_none ) {
631 // autodetect format
632 LVArray<lUInt8> buf;
633 readRecord(1, &buf);
634 int bytesRead = buf.length();
635 if ( bytesRead>0 ) {
636 int pmlCount = 0;
637 int htmlCount = 0;
638 lString32 pmlChars("pXxCcriuovtnsblaUBSmqQI");
639 for ( int i=0; i<bytesRead-10; i++ ) {
640 const lUInt8 * p = buf.get() + i;
641 if ( p[0]=='\\' ) {
642 if ( pmlChars.pos(lString32((const lChar8 *)p+1, 1)) >=0 )
643 pmlCount++;
644 } else if (p[0]=='<') {
645 if ( pattern_cmp(p+1, "html") )
646 htmlCount+=100;
647 if ( pattern_cmp(p+1, "head") )
648 htmlCount+=50;
649 if ( pattern_cmp(p+1, "body") )
650 htmlCount+=50;
651 if ( pattern_cmp(p+1, "h1") || pattern_cmp(p+1, "h2") || pattern_cmp(p+1, "h3") || pattern_cmp(p+1, "h4"))
652 htmlCount+=5;
653 if ( pattern_cmp(p+1, "p>") || pattern_cmp(p+1, "b>") || pattern_cmp(p+1, "i>") || pattern_cmp(p+1, "li>") || pattern_cmp(p+1, "ul>"))
654 htmlCount+=10;
655 }
656 }
657 if ( pmlCount<5 && htmlCount<10 ) {
658 contentFormat = doc_format_txt;
659 } else if ( pmlCount > htmlCount ) {
660 contentFormat = doc_format_fb2;
661 } else {
662 contentFormat = doc_format_html;
663 }
664 }
665 SetPos(0);
666 }
667 }
668
/// Returns the document properties reference held by this file object.
CRPropRef getDocProps() {
    return m_doc_props;
}
672
open(LVStreamRef stream,LVPDBContainer * container,bool validateContent,doc_format_t & contentFormat)673 bool open( LVStreamRef stream, LVPDBContainer * container, bool validateContent, doc_format_t & contentFormat ) {
674 contentFormat = doc_format_none;
675 _format = UNKNOWN;
676 stream->SetPos(0);
677 lUInt32 fsize = stream->GetSize();
678 PDBHdr hdr = { 0 };
679 PDBRecordEntry entry;
680 if ( !hdr.read(stream) )
681 return false;
682 if ( hdr.recordCount==0 )
683 return 0;
684
685 if ( hdr.checkType("TEXt") && hdr.checkCreator("REAd") )
686 _format = PALMDOC;
687 if ( hdr.checkType("PNRd") && hdr.checkCreator("PPrs") )
688 _format = EREADER;
689 if ( hdr.checkType("BOOK") && hdr.checkCreator("MOBI") )
690 _format = MOBI;
691 if ( hdr.checkType("Data") && hdr.checkCreator("Plkr") )
692 _format = PLUCKER;
693 // if ( hdr.checkType("ToGo") && hdr.checkCreator("ToGo") )
694 // _format = ISILO;
695 if ( _format==UNKNOWN )
696 return false; // UNKNOWN FORMAT
697
698 stream->SetPos(0x4E);
699 lUInt32 lastEntryStart = 0;
700 _records.addSpace(hdr.recordCount);
701 for ( int i=0; i<hdr.recordCount; i++ ) {
702 if ( !entry.read(stream) )
703 return false;
704 lUInt32 pos = entry.localChunkId;
705 if ( pos<lastEntryStart || pos>=fsize )
706 return false;
707 _records[i].offset = pos;
708 if ( i>0 )
709 _records[i-1].size = pos - _records[i-1].offset;
710 lastEntryStart = pos;
711 }
712 _records[_records.length()-1].size = fsize - _records[_records.length()-1].offset;
713
714
715 _stream = stream;
716
717 if ( _format==EREADER ) {
718 if ( _records[0].size<sizeof(EReaderHeader) )
719 return false;
720 EReaderHeader preamble = { 0 };
721 stream->SetPos(_records[0].offset);
722 if ( !preamble.read(stream) )
723 return false; // invalid preamble
724 _recordCount = preamble.nonTextRecordStart - 1;
725 if ( _recordCount>=_records.length() )
726 return false;
727 _compression = preamble.compression;
728 if ( _compression==1 )
729 _compression = 0;
730 _textSize = (lUInt32)-1;
731 if ( preamble.imageCount && container ) {
732 for ( int index=preamble.imageDataRecordStart; index<preamble.imageDataRecordStart+preamble.imageCount; index++ ) {
733 lUInt32 start = _records[index].offset + 62;
734 lUInt32 size = _records[index].size - 62;
735 if ( start<fsize && start+size<=fsize ) {
736 stream->SetPos(_records[index].offset);
737 if ( stream->ReadByte()=='P' && stream->ReadByte()=='N' && stream->ReadByte()=='G' && stream->ReadByte()==' ' ) {
738 // header ok, adding item
739 char name[33] = { 0 };
740 lvsize_t bytesRead = 0;
741 stream->Read(name, 32, &bytesRead);
742 if ( name[0] ) {
743 lString32 fname = lString32(name);
744 container->addItem( new LVPDBRegionContainerItem( stream, this, fname, start, size ) );
745 }
746 }
747 }
748 }
749 }
750 } else if (_format==MOBI ) {
751 if ( _records[0].size<sizeof(MobiPreamble) )
752 return false;
753 if (!validateContent)
754 contentFormat = doc_format_pdb;
755
756 MobiPreamble preamble = {};
757 stream->SetPos(_records[0].offset);
758 if ( !preamble.read(stream, _mobiExtraDataFlags) )
759 return false; // invalid preamble
760 if ( preamble.recordCount>=_records.length() )
761 return false;
762 _compression = preamble.compression;
763 if ( _compression==1 )
764 _compression = 0;
765 _textSize = preamble.textLength;
766 _recordCount = preamble.firstNonBookIndex - 1;
767 lUInt32 coverOffset = (lUInt32)-1;
768 lUInt32 thumbOffset = 0;
769 if (preamble.mobiFlags & 0x40) {
770 // EXTH present
771 stream->SetPos(_records[0].offset + 16 + preamble.hederLength);
772 char exth_tag[4] = {0, 0, 0, 0};
773 stream->Read(&exth_tag, 4, NULL);
774 if (exth_tag[0] == 'E' && exth_tag[1] == 'X' && exth_tag[2] == 'T' && exth_tag[3] == 'H') {
775 CRLog::trace("EXTH record found");
776 lUInt32 hdrLen = 0;
777 lUInt32 recCount = 0;
778 lvByteOrderConv cnv;
779 stream->Read(&hdrLen);
780 stream->Read(&recCount);
781 if ( cnv.lsf() ) {
782 cnv.rev(&hdrLen);
783 cnv.rev(&recCount);
784 }
785 LVArray<lUInt8> buf2;
786 for (lUInt32 i=0; i<recCount; i++) {
787 lUInt32 recType = 0;
788 lUInt32 recLen = 0;
789 stream->Read(&recType);
790 stream->Read(&recLen);
791 if ( cnv.lsf() ) {
792 cnv.rev(&recType);
793 cnv.rev(&recLen);
794 }
795 buf2.reset();
796 if (recLen > 8) {
797 lvpos_t nextPos = stream->GetPos() + recLen - 8;
798 //================================
799 if (recLen == 12 && recType == 201) {
800 stream->Read(&coverOffset);
801 cnv.msf(&coverOffset);
802 } else if (recLen == 12 && recType == 202) {
803 stream->Read(&thumbOffset);
804 cnv.msf(&thumbOffset);
805 } else {
806 buf2.addSpace(recLen);
807 if (stream->Read(buf2.get(), recLen - 8, NULL) != LVERR_OK)
808 break;
809 if (recType == 100) {
810 lString8 author((const char *)buf2.get(), recLen - 8);
811 CRLog::trace("MOBI author: %s", author.c_str());
812 m_doc_props->setString(DOC_PROP_AUTHORS, Utf8ToUnicode(author));
813 } else if (recType == 105) {
814 lString8 s((const char *)buf2.get(), recLen - 8);
815 CRLog::trace("MOBI subject: %s", s.c_str());
816 m_doc_props->setString(DOC_PROP_TITLE, Utf8ToUnicode(s));
817 }
818 }
819 //================================
820 stream->SetPos(nextPos);
821 }
822 }
823 }
824 }
825 if (container) {
826 for ( int index=preamble.firstImageIndex; index<_records.length(); index++ ) {
827 stream->SetPos(_records[index].offset);
828 lUInt8 buf[256];
829 stream->Read(buf, 16, NULL);
830 //CRLog::debug("Image record %d [%02x %02x %02x %02x %02x]", index, buf[0], buf[1], buf[2], buf[3], buf[4]);
831 const char * fmt = NULL;
832 if (buf[0]==0xff && buf[1]==0xd8 && buf[2]==0xFF && buf[3]==0xe0)
833 fmt = "jpeg";
834 if (buf[0]==0x89 && buf[1]=='P' && buf[2]=='N' && buf[3]=='G')
835 fmt = "png";
836 if (buf[0]=='G' && buf[1]=='I' && buf[2]=='F')
837 fmt = "gif";
838 if (fmt) {
839 lString32 name = lString32(MOBI_IMAGE_NAME_PREFIX) + fmt::decimal((int) (index - preamble.firstImageIndex + 1));
840 //CRLog::debug("Adding image %s [%d] %s", LCSTR(name), _records[index].size, fmt);
841 container->addItem( new LVPDBRegionContainerItem( stream, this, name, _records[index].offset, _records[index].size ) );
842 if ((unsigned)index == preamble.firstImageIndex + coverOffset) {
843 m_doc_props->setString(DOC_PROP_COVER_FILE, name);
844 CRLog::trace("MOBI COVER: %s", LCSTR(name));
845 }
846 }
847 }
848 }
849 } else if (_format==PALMDOC ) {
850 if ( _records[0].size<sizeof(PalmDocPreamble) )
851 return false;
852 PalmDocPreamble preamble = { 0 };
853 stream->SetPos(_records[0].offset);
854 if ( !preamble.read(stream) )
855 return false; // invalid preamble
856 if ( preamble.recordCount>=_records.length() )
857 return false;
858 _compression = preamble.compression;
859 if ( _compression==1 )
860 _compression = 0;
861 _textSize = preamble.textLength;
862 _recordCount = preamble.recordCount;
863 } else if (_format==PLUCKER ) {
864 // TODO
865 return false;
866 }
867
868 // if (_mobiExtraDataFlag) {
869 // // remove extra data
870 // for ( int k=1; k<_recordCount; k++ )
871 // _records[k+1].size -= 6;
872 // }
873
874 //#ifdef DUMP_PDB_CONTENTS
875 // int unpoffset2 = 0;
876 // FILE * out = fopen("/tmp/pdbout.txt", "wb");
877 // int k;
878 // for (k=1; k <= _recordCount && unpoffset2 < this->_textSize; k++) {
879 // LVArray<lUInt8> dst;
880 // readRecordNoUnpack(k, &_buf);
881 // if (_mobiExtraDataFlags) {
882 // removeExtraData(k, _buf);
883 //// int b = _buf[_buf.length()-1];
884 //// CRLog::trace("Extra data: %d bytes", b);
885 //// _records[k].size -= b;
886 //// _buf.erase(_buf.length()-1-b, b);
887 // }
888 // if (_compression == 2) {
889 // unpack(dst, _buf);
890 // _records[k].unpoffset = unpoffset2;
891 // _records[k].unpsize = dst.length();
892 // unpoffset2 += dst.length();
893 // fwrite(dst.get(), dst.length(), 1, out);
894 // fprintf(out, "\n[block %d end]\n", k);
895 // }
896 // CRLog::trace("record[%d] : %06x %06x - %06x %06x", k, _records[k].offset, _records[k].size, _records[k].unpoffset, _records[k].unpsize);
897 // }
898 // fclose(out);
899 // CRLog::trace("totalUncompSizeHdr=%06x realUncompSize=%06x %d blocks of %d", this->_textSize, unpoffset2, k, _records.length());
900 //#endif
901
902 if ( !validateContent )
903 return true; // for simple format check
904
905 LVArray<lUInt8> buf;
906 lUInt32 unpoffset = 0;
907 _crc = 0;
908 for ( int k=0; k<_recordCount; k++ ) {
909
910 readRecord(k+1, &buf);
911 _records[k+1].unpoffset = unpoffset;
912 _records[k+1].unpsize = buf.length();
913 unpoffset += buf.length();
914 _crc = lStr_crc32( _crc, buf.get(), buf.length() );
915 }
916 _mobiExtraDataFlags = 0;
917
918
919 detectFormat( contentFormat );
920
921
922
923 #ifdef DUMP_PDB_CONTENTS
924 {
925 int unpoffset2 = 0;
926 FILE * out = fopen("/tmp/pdbout.txt", "wb");
927 int k;
928 for (k=1; k <= _recordCount && unpoffset2 < this->_textSize; k++) {
929 LVArray<lUInt8> dst;
930 readRecordNoUnpack(k, &_buf);
931 // if (_mobiExtraDataFlags) {
932 // removeExtraData(k, _buf);
933 // int b = _buf[_buf.length()-1];
934 // CRLog::trace("Extra data: %d bytes", b);
935 // _records[k].size -= b;
936 // _buf.erase(_buf.length()-1-b, b);
937 // }
938 if (_compression == 2) {
939 unpack(dst, _buf);
940 _records[k].unpoffset = unpoffset2;
941 _records[k].unpsize = dst.length();
942 unpoffset2 += dst.length();
943 fwrite(dst.get(), dst.length(), 1, out);
944 fprintf(out, "\n[block %d end]\n", k);
945 }
946 CRLog::trace("record[%d] : %06x %06x - %06x %06x", k, _records[k].offset, _records[k].size, _records[k].unpoffset, _records[k].unpsize);
947 }
948 fclose(out);
949 CRLog::trace("totalUncompSizeHdr=%06x realUncompSize=%06x %d blocks of %d", this->_textSize, unpoffset2, k, _records.length());
950 }
951 #endif
952
953 if (_textSize == (lUInt32)-1)
954 _textSize = unpoffset;
955 else if (unpoffset < _textSize) {
956 CRLog::warn("PDB: Unpacked text size is %d but expected %d", unpoffset, _textSize);
957 _textSize = unpoffset;
958 //return false; // text size does not match
959 }
960
961
962 _bufIndex = -1;
963 _bufSize = 0;
964 _bufOffset = 0;
965
966 SetName(_stream->GetName());
967 m_mode = LVOM_READ;
968
969
970 return true;
971 }
972
973 /// Seek (change file pos)
974 /**
    \param offset is file offset (bytes) relative to origin
976 \param origin is offset base
977 \param pNewPos points to place to store new file position
978 \return lverror_t status: LVERR_OK if success
979 */
Seek(lvoffset_t offset,lvseek_origin_t origin,lvpos_t * pNewPos)980 virtual lverror_t Seek( lvoffset_t offset, lvseek_origin_t origin, lvpos_t * pNewPos ) {
981 lvpos_t npos = 0;
982 lvpos_t currpos = _pos;
983 switch (origin) {
984 case LVSEEK_SET:
985 npos = offset;
986 break;
987 case LVSEEK_CUR:
988 npos = currpos + offset;
989 break;
990 case LVSEEK_END:
991 npos = _textSize + offset;
992 break;
993 }
994 if (npos > _textSize)
995 return LVERR_FAIL;
996 if (!seek(npos) )
997 return LVERR_FAIL;
998 if (pNewPos)
999 *pNewPos = _pos;
1000 return LVERR_OK;
1001 }
1002
1003 /// Get file position
1004 /**
1005 \return lvpos_t file position
1006 */
GetPos()1007 virtual lvpos_t GetPos()
1008 {
1009 return _pos;
1010 }
1011
1012 /// Get file size
1013 /**
1014 \return lvsize_t file size
1015 */
GetSize()1016 virtual lvsize_t GetSize()
1017 {
1018 return _textSize;
1019 }
1020
GetSize(lvsize_t * pSize)1021 virtual lverror_t GetSize( lvsize_t * pSize )
1022 {
1023 *pSize = _textSize;
1024 return LVERR_OK;
1025 }
1026
1027 /// Set file size
1028 /**
1029 \param size is new file size
1030 \return lverror_t status: LVERR_OK if success
1031 */
SetSize(lvsize_t size)1032 virtual lverror_t SetSize( lvsize_t size ) {
1033 CR_UNUSED(size);
1034 return LVERR_NOTIMPL;
1035 }
1036
1037 /// Read
1038 /**
1039 \param buf is buffer to place bytes read from stream
1040 \param count is number of bytes to read from stream
1041 \param nBytesRead is place to store real number of bytes read from stream
1042 \return lverror_t status: LVERR_OK if success
1043 */
Read(void * buf,lvsize_t count,lvsize_t * nBytesRead)1044 virtual lverror_t Read( void * buf, lvsize_t count, lvsize_t * nBytesRead ) {
1045 lvsize_t bytesRead = 0;
1046 if ( nBytesRead )
1047 *nBytesRead = bytesRead;
1048 lUInt8 * dst = (lUInt8 *)buf;
1049 while ( count > 0 ) {
1050 if ( ! seek(_pos) ) {
1051 if ( _pos>=_textSize )
1052 break;
1053 return LVERR_FAIL;
1054 }
1055 int bytesLeft = (int)(_bufOffset + _bufSize - _pos);
1056 if ( bytesLeft<=0 )
1057 break;
1058 int sz = count;
1059 if ( sz>bytesLeft )
1060 sz = bytesLeft;
1061 for ( int i=0; i<sz; i++ )
1062 dst[i] = _buf[_pos - _bufOffset + i];
1063 _pos += sz;
1064 dst += sz;
1065 count -= sz;
1066 bytesRead += sz;
1067 }
1068 if ( nBytesRead )
1069 *nBytesRead = bytesRead;
1070 return LVERR_OK;
1071 }
1072
1073 /// Write
1074 /**
1075 \param buf is data to write to stream
1076 \param count is number of bytes to write
1077 \param nBytesWritten is place to store real number of bytes written to stream
1078 \return lverror_t status: LVERR_OK if success
1079 */
Write(const void * buf,lvsize_t count,lvsize_t * nBytesWritten)1080 virtual lverror_t Write( const void * buf, lvsize_t count, lvsize_t * nBytesWritten ) {
1081 CR_UNUSED3(buf, count, nBytesWritten);
1082 return LVERR_NOTIMPL;
1083 }
1084
1085 /// Check whether end of file is reached
1086 /**
1087 \return true if end of file reached
1088 */
Eof()1089 virtual bool Eof() {
1090 return _pos>=_textSize;
1091 }
1092
getFormat()1093 Format getFormat() { return _format; }
1094
1095 /// Constructor
PDBFile()1096 PDBFile() {
1097 //_container.AddRef();
1098 _bufIndex = -1;
1099 _mobiExtraDataFlags = 0;
1100 m_doc_props = LVCreatePropsContainer();
1101 }
1102
1103 /// Destructor
virtual ~PDBFile()
{
    // nothing to release explicitly
}
1105
1106 };
1107
1108 // open PDB stream from stream
1109 //LVStreamRef LVOpenPDBStream( LVStreamRef srcstream, int &format )
1110 //{
1111 // PDBFile * stream = PDBFile::create( srcstream, format );
1112 // srcstream->SetPos(0);
1113 // if ( stream!=NULL )
1114 // {
1115 // return LVStreamRef( stream );
1116 // }
1117 // return LVStreamRef();
1118 //}
1119
DetectPDBFormat(LVStreamRef stream,doc_format_t & contentFormat)1120 bool DetectPDBFormat( LVStreamRef stream, doc_format_t & contentFormat )
1121 {
1122 PDBFile pdb;
1123 if ( !pdb.open(stream, NULL, false, contentFormat) )
1124 return false;
1125 return true;
1126 }
1127
isCorrectUtf8Text(LVStreamRef & stream)1128 bool isCorrectUtf8Text(LVStreamRef & stream) {
1129 char enc_name[32];
1130 char lang_name[32];
1131 lvpos_t oldpos = stream->GetPos();
1132 unsigned sz = 16384;
1133 stream->SetPos( 0 );
1134 if ( sz>stream->GetSize() )
1135 sz = stream->GetSize();
1136 if (sz < 8)
1137 return false;
1138 unsigned char * buf = new unsigned char[ sz ];
1139 lvsize_t bytesRead = 0;
1140 if ( stream->Read( buf, sz, &bytesRead )!=LVERR_OK ) {
1141 delete[] buf;
1142 stream->SetPos( oldpos );
1143 return false;
1144 }
1145
1146 int res = 0;
1147 res = AutodetectCodePageUtf(buf, sz, enc_name, lang_name);
1148 delete[] buf;
1149 return res != 0;
1150 }
1151
GetPDBCoverpage(LVStreamRef stream)1152 LVStreamRef GetPDBCoverpage(LVStreamRef stream)
1153 {
1154 doc_format_t contentFormat = doc_format_none;
1155 PDBFile * pdb = new PDBFile();
1156 LVPDBContainer * container = new LVPDBContainer();
1157 if (!pdb->open(stream, container, false, contentFormat)) {
1158 delete container;
1159 delete pdb;
1160 return LVStreamRef();
1161 }
1162 stream = LVStreamRef(pdb);
1163 LVContainerRef cnt(container);
1164 container->setStream(stream);
1165 LVStreamRef coverStream;
1166 lString32 coverName = pdb->getDocProps()->getStringDef(DOC_PROP_COVER_FILE);
1167 if (!coverName.empty()) {
1168 coverStream = cnt->OpenStream(coverName.c_str(), LVOM_READ);
1169 }
1170 if (!coverStream.isNull()) {
1171 CRLog::trace("Found PDB coverpage image");
1172 return LVCreateMemoryStream(coverStream);
1173 }
1174 return LVStreamRef();
1175 }
1176
ImportPDBDocument(LVStreamRef & stream,ldomDocument * doc,LVDocViewCallback * progressCallback,CacheLoadingCallback * formatCallback,doc_format_t & contentFormat)1177 bool ImportPDBDocument( LVStreamRef & stream, ldomDocument * doc, LVDocViewCallback * progressCallback, CacheLoadingCallback * formatCallback, doc_format_t & contentFormat )
1178 {
1179 contentFormat = doc_format_none;
1180 PDBFile * pdb = new PDBFile();
1181 LVPDBContainer * container = new LVPDBContainer();
1182 if ( !pdb->open(stream, container, true, contentFormat) ) {
1183 delete container;
1184 delete pdb;
1185 return false;
1186 }
1187 pdb->getDocProps()->set(doc->getProps());
1188 stream = LVStreamRef(pdb);
1189 container->setStream(stream);
1190 doc->setContainer(LVContainerRef(container));
1191
1192 #if BUILD_LITE!=1
1193 if ( doc->openFromCache(formatCallback) ) {
1194 if ( progressCallback ) {
1195 progressCallback->OnLoadFileEnd( );
1196 }
1197 return true;
1198 }
1199 #endif
1200 doc->getProps()->set(pdb->getDocProps());
1201
1202 switch ( contentFormat ) {
1203 case doc_format_html:
1204 // HTML
1205 {
1206
1207 ldomDocumentWriterFilter writerFilter(doc, false,
1208 HTML_AUTOCLOSE_TABLE);
1209 LVHTMLParser parser(stream, &writerFilter);
1210 parser.setProgressCallback(progressCallback);
1211 if ( !parser.CheckFormat() ) {
1212 return false;
1213 } else {
1214 if (pdb->getFormat()==PDBFile::MOBI && isCorrectUtf8Text(stream))
1215 parser.SetCharset(U"utf-8");
1216 if (!parser.Parse()) {
1217 return false;
1218 }
1219 }
1220 }
1221 break;
1222 default:
1223 //case doc_format_txt:
1224 // TXT
1225 {
1226 ldomDocumentWriter writer(doc);
1227 LVTextParser parser(stream, &writer, false);
1228 parser.setProgressCallback(progressCallback);
1229 if ( !parser.CheckFormat() ) {
1230 return false;
1231 } else {
1232 if (!parser.Parse()) {
1233 return false;
1234 }
1235 }
1236 }
1237 break;
1238 // PML
1239 {
1240 // ldomDocumentWriterFilter writerFilter(*doc, false,
1241 // HTML_AUTOCLOSE_TABLE);
1242
1243 // LVHTMLParser parser(m_stream, &writerFilter);
1244 // parser->setProgressCallback(progressCallback);
1245 // if ( !parser->CheckFormat() ) {
1246 // return false;
1247 // } else {
1248 // if (!parser->Parse()) {
1249 // return false;
1250 // }
1251 // }
1252 }
1253 break;
1254 }
1255 #ifdef DUMP_PDB_CONTENTS
1256 for (int i=0; i<container->GetObjectCount(); i++) {
1257 const LVContainerItemInfo * item = container->GetObjectInfo(i);
1258 if (item->IsContainer())
1259 continue;
1260 lString32 fn = item->GetName();
1261 if (fn.empty())
1262 fn = cs32("pdb_item_") + lString32::itoa(i);
1263 fn = cs32("/tmp/") + fn;
1264 LVStreamRef in = container->OpenStream(item->GetName(), LVOM_READ);
1265 if (in.isNull())
1266 continue;
1267 LVStreamRef out = LVOpenFileStream(fn.c_str(), LVOM_WRITE);
1268 if (out.isNull())
1269 continue;
1270 CRLog::trace("Dumping stream %s (%d)", LCSTR(fn), (int)item->GetSize());
1271 LVPumpStream(out.get(), in.get());
1272 }
1273 {
1274 LVStreamRef out = LVOpenFileStream("/tmp/pdb_main.txt", LVOM_WRITE);
1275 if (!out.isNull()) {
1276 stream->SetPos(0);
1277 CRLog::trace("Dumping stream /tmp/pdb_main.txt (%d)", (int)stream->GetSize());
1278 LVPumpStream(out.get(), stream.get());
1279 stream->SetPos(0);
1280 }
1281 }
1282 #endif
1283
1284 return true;
1285 }
1286