1 #include "../include/pdbfmt.h"
2 #include "../include/crlog.h"
3 #include <ctype.h>
4 
5 // uncomment following line to save PDB content streams to /tmp
6 //#define DUMP_PDB_CONTENTS
7 
8 struct PDBHdr
9 {
10     lUInt8    name[32];
11     lUInt16   attributes;
12     lUInt16   version;
13     lUInt32    creationDate;
14     lUInt32    modificationDate;
15     lUInt32    lastBackupDate;
16     lUInt32    modificationNumber;
17     lUInt32    appInfoID;
18     lUInt32    sortInfoID;
19     lUInt8     type[4];
20     lUInt8     creator[4];
21     lUInt32    uniqueIDSeed;
22     lUInt32    nextRecordList;
23     lUInt16    recordCount;
24     lUInt16    firstEntry;
readPDBHdr25     bool read( LVStreamRef stream ) {
26         // TODO: byte order support
27         lvsize_t bytesRead = 0;
28         if ( stream->Read(this, sizeof(PDBHdr), &bytesRead )!=LVERR_OK )
29             return false;
30         if ( bytesRead!=sizeof(PDBHdr) )
31             return false;
32         lvByteOrderConv cnv;
33         if ( cnv.lsf() )
34         {
35             cnv.rev(&attributes);
36             cnv.rev(&version);
37             cnv.rev(&creationDate);
38             cnv.rev(&modificationDate);
39             cnv.rev(&lastBackupDate);
40             cnv.rev(&modificationNumber);
41             cnv.rev(&appInfoID);
42             cnv.rev(&sortInfoID);
43             cnv.rev(&uniqueIDSeed);
44             cnv.rev(&nextRecordList);
45             cnv.rev(&recordCount);
46             cnv.rev(&firstEntry);
47         }
48         return true;
49     }
checkTypePDBHdr50     bool checkType( const char * str ) {
51         return type[0]==str[0] && type[1]==str[1] && type[2]==str[2] && type[3]==str[3];
52     }
53 
checkCreatorPDBHdr54     bool checkCreator( const char * str ) {
55         return creator[0]==str[0] && creator[1]==str[1] && creator[2]==str[2] && creator[3]==str[3];
56     }
57 };
58 
59 struct PDBRecordEntry
60 {
61     lUInt32 localChunkId;
62     lUInt8  attributes[4];
63     //lUInt8  uniqueID[3];
readPDBRecordEntry64     bool read( LVStreamRef stream ) {
65         // TODO: byte order support
66         lvsize_t bytesRead = 0;
67         if ( stream->Read(this, sizeof(PDBRecordEntry), &bytesRead )!=LVERR_OK )
68             return false;
69         if ( bytesRead!=sizeof(PDBRecordEntry) )
70             return false;
71         lvByteOrderConv cnv;
72         if ( cnv.lsf() )
73         {
74             cnv.rev(&localChunkId);
75         }
76         return true;
77     }
78 };
79 
80 struct PalmDocPreamble
81 {
82     lUInt16 compression; // 2  Compression   1 == no compression, 2 = PalmDOC compression (see below)
83     lUInt16 unused;      // 2  Unused  Always zero
84     lUInt32 textLength;  // 4  text length  Uncompressed length of the entire text of the book
85     lUInt16 recordCount; // 2  record count  Number of PDB records used for the text of the book.
86     lUInt16 recordSize;  // 2  record size  Maximum size of each record containing text, always 4096
readPalmDocPreamble87     bool read( LVStreamRef stream ) {
88         // TODO: byte order support
89         lvsize_t bytesRead = 0;
90         if ( stream->Read(this, sizeof(PalmDocPreamble), &bytesRead )!=LVERR_OK )
91             return false;
92         if ( bytesRead!=sizeof(PalmDocPreamble) )
93             return false;
94         lvByteOrderConv cnv;
95         if ( cnv.lsf() )
96         {
97             cnv.rev(&compression); // 2  Compression   1 == no compression, 2 = PalmDOC compression (see below)
98             cnv.rev(&textLength);  // 4  text length  Uncompressed length of the entire text of the book
99             cnv.rev(&recordCount); // 2  record count  Number of PDB records used for the text of the book.
100             cnv.rev(&recordSize);  // 2  record size  Maximum size of each record containing text, always 4096
101         }
102         if ( compression!=1 && compression!=2 )
103             return false;
104         return true;
105     }
106 };
107 
108 struct MobiPreamble : public PalmDocPreamble
109 {
110     lUInt16 mobiEncryption;  // 2  Encryption Type	0 == no encryption, 1 = Old Mobipocket Encryption, 2 = Mobipocket Encryption
111     lUInt16 unused2;     // 2  unknown, usually 0
112 
113     lUInt8  mobiSignature[4]; // 16	4	identifier	the characters M O B I
114     lUInt32 hederLength; // 20	4	header length	the length of the MOBI header, including the previous 4 bytes
115     lUInt32 mobiType;    //    24	4	Mobi type	The kind of Mobipocket file this is
116             //    2 Mobipocket Book
117             //    3 PalmDoc Book
118             //    4 Audio
119             //    257 News
120             //    258 News_Feed
121             //    259 News_Magazine
122             //    513 PICS
123             //    514 WORD
124             //    515 XLS
125             //    516 PPT
126             //    517 TEXT
127             //    518 HTML
128     lUInt32 encoding; //    28	4	text Encoding	1252 = CP1252 (WinLatin1); 65001 = UTF-8
129     lUInt32 uid; //    32	4	Unique-ID	Some kind of unique ID number (random?)
130     lUInt32 fileVersion; //    36	4	File version	Version of the Mobipocket format used in this file.
131     lUInt32 reserved[10]; //    40	40	Reserved	all 0xFF. In case of a dictionary, or some newer file formats, a few bytes are used from this range of 40 0xFFs
132     lUInt32 firstNonBookIndex; //    80	4	First Non-book index?	First record number (starting with 0) that's not the book's text
133     lUInt32 fullNameOffset; //    84	4	Full Name Offset	Offset in record 0 (not from start of file) of the full name of the book
134     lUInt32 fullNameLength; //    88	4	Full Name Length	Length in bytes of the full name of the book
135     lUInt32 locale; //    92	4	Locale	Book locale code. Low byte is main language 09= English, next byte is dialect, 08 = British, 04 = US. Thus US English is 1033, UK English is 2057.
136     lUInt32 inputLanguage; //    96	4	Input Language	Input language for a dictionary
137     lUInt32 outputLanguage; //    100	4	Output Language	Output language for a dictionary
138     lUInt32 minVersion; //    104	4	Min version	Minimum mobipocket version support needed to read this file.
139     lUInt32 firstImageIndex; //    108	4	First Image index?	First record number (starting with 0) that contains an image. Image records should be sequential.
140     lUInt32 huffmanRecordOffset; //    112	4	Huffman Record Offset	The record number of the first huffman compression record.
141     lUInt32 huffmanRecordCount; //    116	4	Huffman Record Count	The number of huffman compression records.
142     lUInt32 reserved2[2]; //    120	8	?	eight bytes, often zeros
143     lUInt32 mobiFlags; //    128	4	EXTH flags	bitfield. if bit 6 (0x40) is set, then there's an EXTH record
144     lUInt32 unknown3[8]; //    132	32	?	32 unknown bytes, if MOBI is long enough
145     lUInt32 drmOffset; //    164	4	DRM Offset	Offset to DRM key info in DRMed files. 0xFFFFFFFF if no DRM
146     lUInt32 drmCount; //    168	4	DRM Count	Number of entries in DRM info. 0xFFFFFFFF if no DRM
147     lUInt32 drmSize; //    172	4	DRM Size	Number of bytes in DRM info.
148     lUInt32 drmFlags; //    176	4	DRM Flags	Some flags concerning the DRM info.
149 
150 
readMobiPreamble151     bool read( LVStreamRef stream, lUInt16 & extraDataFlags ) {
152         extraDataFlags = 0;
153         lvsize_t bytesRead = 0;
154         if ( stream->Read(this, sizeof(MobiPreamble), &bytesRead )!=LVERR_OK )
155             return false;
156         if ( bytesRead!=sizeof(MobiPreamble) )
157             return false;
158         lvByteOrderConv cnv;
159         if ( cnv.lsf() )
160         {
161             cnv.rev(&compression); // 2  Compression   1 == no compression, 2 = PalmDOC compression (see below)
162             cnv.rev(&textLength);  // 4  text length  Uncompressed length of the entire text of the book
163             cnv.rev(&recordCount); // 2  record count  Number of PDB records used for the text of the book.
164             cnv.rev(&recordSize);  // 2  record size  Maximum size of each record containing text, always 4096
165             cnv.rev(&mobiEncryption);// 2  Encryption Type	0 == no encryption, 1 = Old Mobipocket Encryption, 2 = Mobipocket Encryption
166             cnv.rev(&hederLength); // 20	4	header length	the length of the MOBI header, including the previous 4 bytes
167             cnv.rev(&mobiType);    //    24	4	Mobi type	The kind of Mobipocket file this is
168             cnv.rev(&encoding); //    28	4	text Encoding	1252 = CP1252 (WinLatin1); 65001 = UTF-8
169             cnv.rev(&uid); //    32	4	Unique-ID	Some kind of unique ID number (random?)
170             cnv.rev(&fileVersion); //    36	4	File version	Version of the Mobipocket format used in this file.
171             cnv.rev(&firstNonBookIndex); //    80	4	First Non-book index?	First record number (starting with 0) that's not the book's text
172             cnv.rev(&fullNameOffset); //    84	4	Full Name Offset	Offset in record 0 (not from start of file) of the full name of the book
173             cnv.rev(&fullNameLength); //    88	4	Full Name Length	Length in bytes of the full name of the book
174             cnv.rev(&locale); //    92	4	Locale	Book locale code. Low byte is main language 09= English, next byte is dialect, 08 = British, 04 = US. Thus US English is 1033, UK English is 2057.
175             cnv.rev(&inputLanguage); //    96	4	Input Language	Input language for a dictionary
176             cnv.rev(&outputLanguage); //    100	4	Output Language	Output language for a dictionary
177             cnv.rev(&minVersion); //    104	4	Min version	Minimum mobipocket version support needed to read this file.
178             cnv.rev(&firstImageIndex); //    108	4	First Image index?	First record number (starting with 0) that contains an image. Image records should be sequential.
179             cnv.rev(&huffmanRecordOffset); //    112	4	Huffman Record Offset	The record number of the first huffman compression record.
180             cnv.rev(&huffmanRecordCount); //    116	4	Huffman Record Count	The number of huffman compression records.
181             cnv.rev(&mobiFlags); //    128	4	EXTH flags	bitfield. if bit 6 (0x40) is set, then there's an EXTH record
182             cnv.rev(&drmOffset); //    164	4	DRM Offset	Offset to DRM key info in DRMed files. 0xFFFFFFFF if no DRM
183             cnv.rev(&drmCount); //    168	4	DRM Count	Number of entries in DRM info. 0xFFFFFFFF if no DRM
184             cnv.rev(&drmSize); //    172	4	DRM Size	Number of bytes in DRM info.
185             cnv.rev(&drmFlags); //    176	4	DRM Flags	Some flags concerning the DRM info.
186         }
187         if ( compression!=1 && compression!=2 )
188             return false;
189         if ( mobiType!=2 && mobiType!=3 && mobiType!=517 && mobiType!=518
190                  && mobiType!=257 && mobiType!=258 && mobiType!=259 )
191             return false; // unsupported type
192         if ( mobiEncryption!=0 )
193             return false; // encryption is not supported
194         if ( hederLength >= 0xE4 ) {
195             stream->Seek(242-180, LVSEEK_CUR, NULL);
196             stream->Read(&extraDataFlags);
197             if ( cnv.lsf() )
198                 cnv.rev(&extraDataFlags);
199 //            if (extraDataFlags) {
200 //                CRLog::trace("extraDataFlags=%04x", (int)extraDataFlags);
201 //            }
202         }
203         return true;
204     }
205 };
206 
207 // format description from http://wiki.mobileread.com/wiki/EReader
208 struct EReaderHeader
209 {
210     lUInt16 compression;    //    0-2	compression	Specifies compression and drm. 2 = palmdoc, 10 = zlib. 260 and 272 = DRM
211     lUInt16 unknown1[2];    //    2-6	unknown	Value of 0 is used
212     lUInt16 encoding;       //    6-8	encoding	Always 25152 (0x6240). All text must be encoded as Latin-1 cp1252
213     lUInt16 smallPageCount; //    8-10	Number of small pages	The number of small font pages. If page index is not build in then 0.
214     lUInt16 largePageCount; //    10-12	Number of large pages	The number of large font pages. If page index is not build in then 0.
215     lUInt16 nonTextRecordStart; //12-14	Non-Text record start	The location of the first non text records. record 1 to this value minus 1 are all text records
216     lUInt16 numberOfChapters;//    14-16	Number of chapters	The number of chapter index records contained in the file
217     lUInt16 smallPageRecordCount; //    16-18	Number of small index	The number of small font page index records contained in the file
218     lUInt16 largePageRecordCount; //    18-20	Number of large index	The number of large font page index records contained in the file
219     lUInt16 imageCount;        //    20-22	Number of images	The number of images contained in the file
220     lUInt16 linkCount;         //    22-24	Number of links	The number of links contained in the file
221     lUInt16 metadataAvailable; //    24-26	Metadata avaliable	Is there a metadata record in the file? 0 = None, 1 = There is a metadata record
222     lUInt16 unknown2; //    26-28	Unknown	Value of 0 is used
223     lUInt16 footnoteRecordsCount; //    28-30	Number of Footnotes	The number of footnote records in the file
224     lUInt16 sidebarRecordsCount; //    30-32	Number of Sidebars	The number of sidebar records in the file
225     lUInt16 chapterIndexStart; //    32-34	Chapter index record start	The location of chapter index records. If there are no chapters use the value for the Last data record.
226     lUInt16 unknown3; //    34-36	2560	Magic value that must be set to 2560
227     lUInt16 smallPageIndexStart; //    36-38	Small page index start	The location of small font page index records. If page table is not built in use the value for the Last data record.
228     lUInt16 largePageIndexStart; //    38-40	Large page index start	The location of large font page index records. If page table is not built in use the value for the Last data record.
229     lUInt16 imageDataRecordStart; //    40-42	Image data record start	The location of the first image record. If there are no images use the value for the Last data record.
230     lUInt16 linksRecordStart; //    42-44	Links record start	The location of the first link index record. If there are no links use the value for the Last data record.
231     lUInt16 metadataRecordStart; //    44-46	Metadata record start	The location of the metadata record. If there is no metadata use the value for the Last data record.
232     lUInt16 unknown4; //    46-48	Unknown	Value of 0 is used
233     lUInt16 footnoteRecordStart; //    48-50	Footnote record start	The location of the first footnote record. If there are no footnotes use the value for the Last data record.
234     lUInt16 sidebarRecordStart; //    50-52	Sidebar record start	The location of the first sidebar record. If there are no sidebars use the value for the Last data record.
235     lUInt16 lastDataRecord; //    52-54	Last data record	The location of the last data record
236     lUInt16 unknown5[39]; //    54-132	Unknown	Value of 0 is used
readEReaderHeader237     bool read( LVStreamRef stream ) {
238         lvsize_t bytesRead = 0;
239         if ( stream->Read(this, sizeof(EReaderHeader), &bytesRead )!=LVERR_OK )
240             return false;
241         if ( bytesRead!=sizeof(EReaderHeader) )
242             return false;
243         lvByteOrderConv cnv;
244         if ( cnv.lsf() )
245         {
246             cnv.rev(&compression);    //    0-2	compression	Specifies compression and drm. 2 = palmdoc, 10 = zlib. 260 and 272 = DRM
247             cnv.rev(&encoding);       //    6-8	encoding	Always 25152 (0x6240). All text must be encoded as Latin-1 cp1252
248             cnv.rev(&smallPageCount); //    8-10	Number of small pages	The number of small font pages. If page index is not build in then 0.
249             cnv.rev(&largePageCount); //    10-12	Number of large pages	The number of large font pages. If page index is not build in then 0.
250             cnv.rev(&nonTextRecordStart); //12-14	Non-Text record start	The location of the first non text records. record 1 to this value minus 1 are all text records
251             cnv.rev(&numberOfChapters);//    14-16	Number of chapters	The number of chapter index records contained in the file
252             cnv.rev(&smallPageRecordCount); //    16-18	Number of small index	The number of small font page index records contained in the file
253             cnv.rev(&largePageRecordCount); //    18-20	Number of large index	The number of large font page index records contained in the file
254             cnv.rev(&imageCount);        //    20-22	Number of images	The number of images contained in the file
255             cnv.rev(&linkCount);         //    22-24	Number of links	The number of links contained in the file
256             cnv.rev(&metadataAvailable); //    24-26	Metadata avaliable	Is there a metadata record in the file? 0 = None, 1 = There is a metadata record
257             cnv.rev(&footnoteRecordsCount); //    28-30	Number of Footnotes	The number of footnote records in the file
258             cnv.rev(&sidebarRecordsCount); //    30-32	Number of Sidebars	The number of sidebar records in the file
259             cnv.rev(&chapterIndexStart); //    32-34	Chapter index record start	The location of chapter index records. If there are no chapters use the value for the Last data record.
260             cnv.rev(&smallPageIndexStart); //    36-38	Small page index start	The location of small font page index records. If page table is not built in use the value for the Last data record.
261             cnv.rev(&largePageIndexStart); //    38-40	Large page index start	The location of large font page index records. If page table is not built in use the value for the Last data record.
262             cnv.rev(&imageDataRecordStart); //    40-42	Image data record start	The location of the first image record. If there are no images use the value for the Last data record.
263             cnv.rev(&linksRecordStart); //    42-44	Links record start	The location of the first link index record. If there are no links use the value for the Last data record.
264             cnv.rev(&metadataRecordStart); //    44-46	Metadata record start	The location of the metadata record. If there is no metadata use the value for the Last data record.
265             cnv.rev(&footnoteRecordStart); //    48-50	Footnote record start	The location of the first footnote record. If there are no footnotes use the value for the Last data record.
266             cnv.rev(&sidebarRecordStart); //    50-52	Sidebar record start	The location of the first sidebar record. If there are no sidebars use the value for the Last data record.
267             cnv.rev(&lastDataRecord); //    52-54	Last data record	The location of the last data record
268         }
269         if ( compression!=1 && compression!=2 && compression!=10 )
270             return false;
271         return true;
272     }
273 };
274 
/// Leading fixed-size fields of a Plucker document header.
/// Only the first four fields are declared; the variable-length parts that
/// follow them in the file are described in the comments below.
struct PluckerPreamble {
    lUInt32 signature; // 4 bytes, numeric: must contain the value 0x6C6E6368.
    lUInt16 hdrVersion; // 2 bytes, numeric: must have the value 3.
    lUInt16 hdrEncoding; // 2 bytes, numeric: must have the value 0.
    lUInt16 verStrWords; // 2 bytes, numeric: number of two-byte words following, containing the version string.
// Variable-length data that follows (not declared as fields):
//    char          2 * verStrWords    String   NUL-terminated ISO Latin-1 string, padded at end if necessary with a zero byte to an even-byte boundary, containing a version string to display to the user.
//    pqaTitleWords 2                  Numeric  The number of two-byte words in the following pqaTitleStr.
//    pqaTitleStr   2 * pqaTitleWords  String   NUL-terminated ISO Latin-1 string, padded at end if necessary with a zero byte to an even-byte boundary, containing a title string for iconic display of the document.
//    iconWords     2                  Numeric  Number of two-byte words in the following icon image.
//    icon          2 * iconWords      Image    Image (32x32) in Palm image format to be used as an icon to represent the document on a desktop-style display. The image may not use a custom color map.
//    smIconWords   2                  Numeric  Number of two-byte words in the following icon image.
//    smIcon        2 * smIconWords    Image    Small image (15x9) in Palm image format to be used as an icon to represent the document on a desktop-style display. The image may not use a custom color map.
};
288 
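/// unpack compsize bytes from compbuf; on success dstbuf/dstsize receive the unpacked data (callers in this file release dstbuf with free())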
290 bool ldomUnpack( const lUInt8 * compbuf, int compsize, lUInt8 * &dstbuf, lUInt32 & dstsize  );
291 
292 class PDBFile;
293 
294 class LVPDBContainerItem : public LVContainerItemInfo {
295 protected:
296     LVStreamRef _stream;
297     PDBFile * _file;
298     int _startBlock;
299     int _size;
300     lString32 _name;
301 public:
302     /// returns object size (file size or directory entry count)
GetSize(lvsize_t * pSize)303     virtual lverror_t GetSize( lvsize_t * pSize ) {
304         *pSize = _size;
305 		return LVERR_OK;
306     }
GetSize() const307     virtual lvsize_t        GetSize() const { return _size; }
GetName() const308     virtual const lChar32 * GetName() const { return _name.c_str(); }
GetFlags() const309     virtual lUInt32         GetFlags() const { return 0; }
IsContainer() const310     virtual bool            IsContainer() const { return false; }
openStream()311     virtual LVStreamRef openStream() {
312         // TODO: implement stream creation
313         return LVStreamRef();
314     }
LVPDBContainerItem(LVStreamRef stream,PDBFile * file,lString32 name,int startBlockIndex,int size)315     LVPDBContainerItem( LVStreamRef stream, PDBFile * file, lString32 name, int startBlockIndex, int size )
316         : _stream(stream), _file(file), _startBlock(startBlockIndex), _size(size), _name(name) {
317     }
318 };
319 
320 class LVPDBRegionContainerItem : public LVPDBContainerItem {
321 public:
322     /// returns object size (file size or directory entry count)
GetFlags() const323     virtual lUInt32         GetFlags() const { return 0; }
openStream()324     virtual LVStreamRef openStream() {
325         // return region of base stream
326         return LVStreamRef( new LVStreamFragment( _stream, _startBlock, _size ) );
327     }
LVPDBRegionContainerItem(LVStreamRef stream,PDBFile * file,lString32 name,int startOffset,int size)328     LVPDBRegionContainerItem( LVStreamRef stream, PDBFile * file, lString32 name, int startOffset, int size )
329         : LVPDBContainerItem(stream, file, name, startOffset, size) {
330     }
331 };
332 
333 class LVPDBContainer : public LVContainer
334 {
335     LVPtrVector<LVPDBContainerItem> _list;
336     LVStreamRef _stream;
337 public:
GetParentContainer()338     virtual LVContainer * GetParentContainer() { return NULL; }
339 
addItem(LVPDBContainerItem * item)340     void addItem ( LVPDBContainerItem * item ) {
341         _list.add(item);
342     }
343 
344     //virtual const LVContainerItemInfo * GetObjectInfo(const lChar32 * pname);
GetObjectInfo(int index)345     virtual const LVContainerItemInfo * GetObjectInfo(int index) {
346         if ( index>=0 && index<_list.length() )
347             return _list[index];
348 		return NULL;
349     }
GetObjectCount() const350     virtual int GetObjectCount() const { return _list.length(); }
351     /// returns object size (file size or directory entry count)
GetSize(lvsize_t * pSize)352     virtual lverror_t GetSize( lvsize_t * pSize ) {
353         *pSize = _list.length();
354 		return LVERR_OK;
355     }
356 
OpenStream(const lChar32 * fname,lvopen_mode_t mode)357     virtual LVStreamRef OpenStream( const lChar32 * fname, lvopen_mode_t mode ) {
358         if ( mode!=LVOM_READ )
359             return LVStreamRef();
360         for ( int i=0; i<_list.length(); i++ ) {
361             //CRLog::trace("OpenStream(%s) : %s", LCSTR(lString32(fname)), LCSTR(lString32(_list[i]->GetName())) );
362             if ( !lStr_cmp(_list[i]->GetName(), fname) )
363                 return _list[i]->openStream();
364         }
365         return LVStreamRef();
366     }
367 
setStream(LVStreamRef stream)368     void setStream( LVStreamRef stream ) {
369         _stream = stream;
370     }
371 
LVPDBContainer()372     LVPDBContainer( ) {
373         //_contentStream = LVStreamRef((LVStream*)file);
374     }
~LVPDBContainer()375     virtual ~LVPDBContainer() { }
376 };
377 
/// Case-insensitive prefix test: returns true when the bytes of `buf`,
/// lower-cased with tolower(), match every character of `pattern`.
/// `pattern` is compared as given, so it only matches if it contains no
/// upper-case letters. `buf` is read for as many bytes as `pattern` is long;
/// no length check is made here.
static bool pattern_cmp( const lUInt8 * buf, const char * pattern ) {
    while ( *pattern ) {
        if ( tolower(*buf)!=*pattern )
            return false;
        ++buf;
        ++pattern;
    }
    return true;
}
384 
385 class PDBFile : public LVNamedStream {
386 public:
387     enum Format {
388         UNKNOWN,
389         PALMDOC,
390         EREADER,
391         PLUCKER,
392         MOBI
393     };
394 private:
395 
    /// Location of one PDB record, both in the file and in the unpacked text.
    struct Record {
        lUInt32 offset;     // position in _stream where the record's bytes start
        lUInt32 size;       // number of bytes read from _stream for this record
        lUInt32 unpoffset;  // start position of this record's text, used for position lookup
        lUInt32 unpsize;    // length of this record's text, used for position lookup
    };
402     LVArray<Record> _records;
403     LVStreamRef _stream;
404     Format _format;
405     int _compression;
406     lUInt32 _textSize;
407     int _recordCount; // text record count
408     // read buffer
409     LVArray<lUInt8> _buf;
410     int     _bufIndex;
411     lvpos_t _bufOffset;
412     lvsize_t _bufSize;
413     lvpos_t _pos;
414     lUInt16 _mobiExtraDataFlags;
415     CRPropRef m_doc_props;
416     //LVPDBContainer * _container;
unpack(LVArray<lUInt8> & dst,LVArray<lUInt8> & src)417     bool unpack( LVArray<lUInt8> & dst, LVArray<lUInt8> & src ) {
418         int srclen = src.length();
419         dst.reset();
420         dst.reserve(srclen);
421 
422         if ( _compression==2 ) {
423             // PalmDOC
424             int pos = 0;
425             lInt32 b;
426 
427             while (pos<srclen) {
428                 b = src[pos];
429                 pos++;
430                 if (b > 0 && b < 9) {
431                     // 1..8 bytes follow
432                     if (pos + b > srclen)
433                         break;
434                     for (int i=0; i<(int)b; i++)
435                         dst.add(src[pos++]);
436                 } else if (b < 128) {
437                     // unmodified single byte
438                     dst.add((lUInt8)b);
439                 } else if (b >= 0xc0) {
440                     dst.add(' ');
441                     dst.add(b & 0x7f);
442                 } else {
443                     if (pos >= srclen)
444                         break;
445                     lUInt32 z = ((b & 0x3f) << 8) + src[pos];
446                     pos++;
447                     int offset = z >> 3;
448                     int size = (z & 7) + 3;
449                     int srcpos = dst.length() - offset;
450                     for (int i = 0; i < size; i++) {
451                         if (srcpos >= 0) {
452                             dst.add(dst[srcpos++]);
453                         } else {
454                             dst.add('?');
455                             //CRLog::trace("wrong offset");
456                         }
457                     }
458                 }
459             }
460         } else if ( _compression==10 ) {
461             // zlib
462             /// unpack data from _compbuf to _buf
463             lUInt8 * dstbuf;
464             lUInt32 dstsize;
465             if ( !ldomUnpack( src.get(), src.size(), dstbuf, dstsize ) )
466                 return false;
467             dst.add(dstbuf, dstsize);
468             free(dstbuf);
469         } else if ( _compression==17480 ) {
470             // zlib
471             // TODO: shouldn't it be HUFFMAN unpacker?
472             /// unpack data from _compbuf to _buf
473             lUInt8 * dstbuf;
474             lUInt32 dstsize;
475             if ( !ldomUnpack( src.get(), src.size(), dstbuf, dstsize ) )
476                 return false;
477             dst.add(dstbuf, dstsize);
478             free(dstbuf);
479         }
480         return true;
481     }
482 
removeExtraData(int index,LVArray<lUInt8> & buf)483     void removeExtraData(int index, LVArray<lUInt8> & buf) {
484         if (index >= _records.length() || !_mobiExtraDataFlags)
485             return;
486         for (int flag = 0x8000; flag; flag >>= 1) {
487             if (!(_mobiExtraDataFlags & flag))
488                 continue;
489             lInt32 n = buf[buf.length()-1];
490             if (flag == 1) {
491                 n &= 3;
492 
493                 _records[index].size -= 1;
494                 buf.erase(buf.length()-1, 1);
495 
496                 if (n>0) {
497                     //CRLog::trace("block %d: removing %d bytes of multibyte character", index, n);
498 
499                     for (int i=n; i>0; i--) {
500                         n = buf[buf.length() - 1];
501                         if (!(n & 0x80))
502                             break;
503                         buf.erase(buf.length() - 1, 1);
504                         if ((n & 0xC0) != 0x80)
505                             break;
506                     }
507                 }
508 
509             } else {
510                 if (!(n & 0x80)) {
511                     lUInt32 n2 = buf[buf.length()-2];
512                     n = (n & 0x7F) | ((n2 & 0x7F) << 16);
513                 } else {
514                     n = n & 0x7F;
515                 }
516                 if (n > 0 && buf.length() >= n) {
517                     //CRLog::trace("block %d: removing %d bytes of extra data type %d", index, n, flag);
518                     _records[index].size -= n;
519                     buf.erase(buf.length()-n, n);
520                 }
521             }
522 //            if (n && buf.length() >= n) {
523 //                _records[index].size -= n;
524 //                buf.erase(buf.length()-n, n);
525 
526 //                if (flag == 1 && n > 1) {
527 //                    CRLog::trace("block %d: removing %d bytes of multibyte character", index, n - 1);
528 //                    // remove extra utf-8 points
529 //                    while (buf.length()) {
530 //                        n = buf[buf.length() - 1];
531 //                        if (!(n & 0x80))
532 //                            break;
533 //                        buf.erase(buf.length() - 1, 1);
534 //                        if ((n & 0xC0) != 0x80)
535 //                            break;
536 //                    }
537 //                }
538 //            }
539         }
540     }
541 
readRecordNoUnpack(int index,LVArray<lUInt8> * dstbuf)542     bool readRecordNoUnpack(int index, LVArray<lUInt8> * dstbuf) {
543         if (index >= _records.length())
544             return false;
545         dstbuf->reset();
546         dstbuf->addSpace(_records[index].size);
547         lvsize_t bytesRead = 0;
548         _stream->SetPos(_records[index].offset);
549         if (_stream->Read(dstbuf->get(), _records[index].size, &bytesRead) != LVERR_OK)
550             return false;
551         if (bytesRead != _records[index].size)
552             return false;
553         return true;
554     }
readRecord(int index,LVArray<lUInt8> * dstbuf)555     bool readRecord( int index, LVArray<lUInt8> * dstbuf ) {
556         if (index >= _records.length())
557             return false;
558         LVArray<lUInt8> srcbuf;
559         LVArray<lUInt8> * buf = _compression ? &srcbuf : dstbuf;
560         if ( buf->empty() )
561             buf->reserve(1); // ensure buf->_array is no more NULL for _stream->Read(dstbuf->get() above
562         if (!readRecordNoUnpack(index, buf))
563             return false;
564 
565         if (_mobiExtraDataFlags && index < _recordCount)
566             removeExtraData(index, *buf);
567 
568         if (!_compression)
569             return true;
570         // unpack
571         return unpack(*dstbuf, srcbuf);
572     }
573 
readBlock(int index)574     bool readBlock( int index ) {
575         if ( index<0 || index>=_recordCount )
576             return false;
577         if ( index==_bufIndex )
578             return true; // already read
579         bool res = readRecord( index+1, &_buf );
580         if ( !res )
581             return false;
582         _bufIndex = index;
583         _bufOffset = _records[index+1].unpoffset;
584         _bufSize = _records[index+1].unpsize;
585         return true;
586     }
587 
findBlock(lvpos_t pos)588     int findBlock( lvpos_t pos ) {
589         if ( pos==_textSize )
590             return _recordCount-1;
591         for ( int i=0; i<_recordCount; i++ ) {
592             if ( pos>=_records[i+1].unpoffset && pos<_records[i+1].unpoffset+_records[i+1].unpsize )
593                 return i;
594         }
595         return -1;
596     }
597 
seek(lvpos_t pos)598     bool seek( lvpos_t pos ) {
599         int index = findBlock(pos);
600         if ( index<0 )
601             return false;
602         bool res = readBlock( index );
603         if ( !res )
604             return false;
605         _pos = pos;
606         return true;
607     }
608 
609 public:
610 
611 //    LVContainerRef getContainer() {
612 //        if ( !_container )
613 //            _container = new LVPDBContainer();
614 //        return LVContainerRef(&_container);
615 //    }
616 
617 
618 //    static PDBFile * create( LVStreamRef stream, int & format ) {
619 //        format = 0;
620 //        PDBFile * res = new PDBFile();
621 //        if ( res->open(stream, true, format) ) {
622 //            format = res->_format;
623 //            return res;
624 //        }
625 //        delete res;
626 //        return NULL;
627 //    }
628 
detectFormat(doc_format_t & contentFormat)629     void detectFormat( doc_format_t & contentFormat ) {
630         if ( contentFormat == doc_format_none ) {
631             // autodetect format
632             LVArray<lUInt8> buf;
633             readRecord(1, &buf);
634             int bytesRead = buf.length();
635             if ( bytesRead>0 ) {
636                 int pmlCount = 0;
637                 int htmlCount = 0;
638                 lString32 pmlChars("pXxCcriuovtnsblaUBSmqQI");
639                 for ( int i=0; i<bytesRead-10; i++ ) {
640                     const lUInt8 * p = buf.get() + i;
641                     if ( p[0]=='\\' ) {
642                         if ( pmlChars.pos(lString32((const lChar8 *)p+1, 1)) >=0 )
643                             pmlCount++;
644                     } else if (p[0]=='<') {
645                         if ( pattern_cmp(p+1, "html") )
646                             htmlCount+=100;
647                         if ( pattern_cmp(p+1, "head") )
648                             htmlCount+=50;
649                         if ( pattern_cmp(p+1, "body") )
650                             htmlCount+=50;
651                         if ( pattern_cmp(p+1, "h1") || pattern_cmp(p+1, "h2") || pattern_cmp(p+1, "h3") || pattern_cmp(p+1, "h4"))
652                             htmlCount+=5;
653                         if ( pattern_cmp(p+1, "p>") || pattern_cmp(p+1, "b>") || pattern_cmp(p+1, "i>") || pattern_cmp(p+1, "li>") || pattern_cmp(p+1, "ul>"))
654                             htmlCount+=10;
655                     }
656                 }
657                 if ( pmlCount<5 && htmlCount<10 ) {
658                     contentFormat = doc_format_txt;
659                 } else if ( pmlCount > htmlCount ) {
660                     contentFormat = doc_format_fb2;
661                 } else {
662                     contentFormat = doc_format_html;
663                 }
664             }
665             SetPos(0);
666         }
667     }
668 
getDocProps()669     CRPropRef getDocProps() {
670         return m_doc_props;
671     }
672 
open(LVStreamRef stream,LVPDBContainer * container,bool validateContent,doc_format_t & contentFormat)673     bool open( LVStreamRef stream, LVPDBContainer * container, bool validateContent, doc_format_t & contentFormat ) {
674         contentFormat = doc_format_none;
675         _format = UNKNOWN;
676         stream->SetPos(0);
677         lUInt32 fsize = stream->GetSize();
678         PDBHdr hdr = { 0 };
679         PDBRecordEntry entry;
680         if ( !hdr.read(stream) )
681             return false;
682         if ( hdr.recordCount==0 )
683             return 0;
684 
685         if ( hdr.checkType("TEXt") && hdr.checkCreator("REAd") )
686             _format = PALMDOC;
687         if ( hdr.checkType("PNRd") && hdr.checkCreator("PPrs") )
688             _format = EREADER;
689         if ( hdr.checkType("BOOK") && hdr.checkCreator("MOBI") )
690             _format = MOBI;
691         if ( hdr.checkType("Data") && hdr.checkCreator("Plkr") )
692             _format = PLUCKER;
693 //        if ( hdr.checkType("ToGo") && hdr.checkCreator("ToGo") )
694 //            _format = ISILO;
695         if ( _format==UNKNOWN )
696             return false; // UNKNOWN FORMAT
697 
698         stream->SetPos(0x4E);
699         lUInt32 lastEntryStart = 0;
700         _records.addSpace(hdr.recordCount);
701         for ( int i=0; i<hdr.recordCount; i++ ) {
702             if ( !entry.read(stream) )
703                 return false;
704             lUInt32 pos = entry.localChunkId;
705             if ( pos<lastEntryStart || pos>=fsize )
706                 return false;
707             _records[i].offset = pos;
708             if ( i>0 )
709                 _records[i-1].size = pos - _records[i-1].offset;
710             lastEntryStart = pos;
711         }
712         _records[_records.length()-1].size = fsize - _records[_records.length()-1].offset;
713 
714 
715         _stream = stream;
716 
717         if ( _format==EREADER ) {
718             if ( _records[0].size<sizeof(EReaderHeader) )
719                 return false;
720             EReaderHeader preamble = { 0 };
721             stream->SetPos(_records[0].offset);
722             if ( !preamble.read(stream) )
723                 return false; // invalid preamble
724             _recordCount = preamble.nonTextRecordStart - 1;
725             if ( _recordCount>=_records.length() )
726                 return false;
727             _compression = preamble.compression;
728             if ( _compression==1 )
729                 _compression = 0;
730             _textSize = (lUInt32)-1;
731             if ( preamble.imageCount && container ) {
732                 for ( int index=preamble.imageDataRecordStart; index<preamble.imageDataRecordStart+preamble.imageCount; index++ ) {
733                     lUInt32 start = _records[index].offset + 62;
734                     lUInt32 size = _records[index].size - 62;
735                     if ( start<fsize && start+size<=fsize ) {
736                         stream->SetPos(_records[index].offset);
737                         if ( stream->ReadByte()=='P' && stream->ReadByte()=='N' && stream->ReadByte()=='G' && stream->ReadByte()==' ' ) {
738                             // header ok, adding item
739                             char name[33] = { 0 };
740                             lvsize_t bytesRead = 0;
741                             stream->Read(name, 32, &bytesRead);
742                             if ( name[0] ) {
743                                 lString32 fname = lString32(name);
744                                 container->addItem( new LVPDBRegionContainerItem( stream, this, fname, start, size ) );
745                             }
746                         }
747                     }
748                 }
749             }
750         } else if (_format==MOBI ) {
751             if ( _records[0].size<sizeof(MobiPreamble) )
752                 return false;
753             if (!validateContent)
754                 contentFormat = doc_format_pdb;
755 
756             MobiPreamble preamble = {};
757             stream->SetPos(_records[0].offset);
758             if ( !preamble.read(stream, _mobiExtraDataFlags) )
759                 return false; // invalid preamble
760             if ( preamble.recordCount>=_records.length() )
761                 return false;
762             _compression = preamble.compression;
763             if ( _compression==1 )
764                 _compression = 0;
765             _textSize = preamble.textLength;
766             _recordCount = preamble.firstNonBookIndex - 1;
767             lUInt32 coverOffset = (lUInt32)-1;
768             lUInt32 thumbOffset = 0;
769             if (preamble.mobiFlags & 0x40) {
770                 // EXTH present
771                 stream->SetPos(_records[0].offset + 16 + preamble.hederLength);
772                 char exth_tag[4] = {0, 0, 0, 0};
773                 stream->Read(&exth_tag, 4, NULL);
774                 if (exth_tag[0] == 'E' && exth_tag[1] == 'X' && exth_tag[2] == 'T' && exth_tag[3] == 'H') {
775                 	CRLog::trace("EXTH record found");
776                     lUInt32 hdrLen = 0;
777                     lUInt32 recCount = 0;
778                     lvByteOrderConv cnv;
779                     stream->Read(&hdrLen);
780                     stream->Read(&recCount);
781                     if ( cnv.lsf() ) {
782                         cnv.rev(&hdrLen);
783                         cnv.rev(&recCount);
784                     }
785                     LVArray<lUInt8> buf2;
786                     for (lUInt32 i=0; i<recCount; i++) {
787                         lUInt32 recType = 0;
788                         lUInt32 recLen = 0;
789                         stream->Read(&recType);
790                         stream->Read(&recLen);
791                         if ( cnv.lsf() ) {
792                             cnv.rev(&recType);
793                             cnv.rev(&recLen);
794                         }
795                         buf2.reset();
796                         if (recLen > 8) {
797                             lvpos_t nextPos = stream->GetPos() + recLen - 8;
798                             //================================
799                             if (recLen == 12 && recType == 201) {
800                                 stream->Read(&coverOffset);
801                                 cnv.msf(&coverOffset);
802                             } else if (recLen == 12 && recType == 202) {
803                                 stream->Read(&thumbOffset);
804                                 cnv.msf(&thumbOffset);
805                             } else {
806                                 buf2.addSpace(recLen);
807                                 if (stream->Read(buf2.get(), recLen - 8, NULL) != LVERR_OK)
808                                     break;
809                                 if (recType == 100) {
810                                     lString8 author((const char *)buf2.get(), recLen - 8);
811                                     CRLog::trace("MOBI author: %s", author.c_str());
812                                     m_doc_props->setString(DOC_PROP_AUTHORS, Utf8ToUnicode(author));
813                                 } else if (recType == 105) {
814                                     lString8 s((const char *)buf2.get(), recLen - 8);
815                                     CRLog::trace("MOBI subject: %s", s.c_str());
816                                     m_doc_props->setString(DOC_PROP_TITLE, Utf8ToUnicode(s));
817                                 }
818                             }
819                             //================================
820                             stream->SetPos(nextPos);
821                         }
822                     }
823                 }
824             }
825             if (container) {
826                 for ( int index=preamble.firstImageIndex; index<_records.length(); index++ ) {
827                     stream->SetPos(_records[index].offset);
828                     lUInt8 buf[256];
829                     stream->Read(buf, 16, NULL);
830                     //CRLog::debug("Image record %d [%02x %02x %02x %02x %02x]", index, buf[0], buf[1], buf[2], buf[3], buf[4]);
831                     const char * fmt = NULL;
832                     if (buf[0]==0xff && buf[1]==0xd8 && buf[2]==0xFF && buf[3]==0xe0)
833                         fmt = "jpeg";
834                     if (buf[0]==0x89 && buf[1]=='P' && buf[2]=='N' && buf[3]=='G')
835                         fmt = "png";
836                     if (buf[0]=='G' && buf[1]=='I' && buf[2]=='F')
837                         fmt = "gif";
838                     if (fmt) {
839                         lString32 name = lString32(MOBI_IMAGE_NAME_PREFIX) + fmt::decimal((int) (index - preamble.firstImageIndex + 1));
840                         //CRLog::debug("Adding image %s [%d] %s", LCSTR(name), _records[index].size, fmt);
841                         container->addItem( new LVPDBRegionContainerItem( stream, this, name, _records[index].offset, _records[index].size ) );
842                         if ((unsigned)index == preamble.firstImageIndex + coverOffset) {
843                             m_doc_props->setString(DOC_PROP_COVER_FILE, name);
844                             CRLog::trace("MOBI COVER: %s", LCSTR(name));
845                         }
846                     }
847                 }
848             }
849         } else if (_format==PALMDOC ) {
850             if ( _records[0].size<sizeof(PalmDocPreamble) )
851                 return false;
852             PalmDocPreamble preamble = { 0 };
853             stream->SetPos(_records[0].offset);
854             if ( !preamble.read(stream) )
855                 return false; // invalid preamble
856             if ( preamble.recordCount>=_records.length() )
857                 return false;
858             _compression = preamble.compression;
859             if ( _compression==1 )
860                 _compression = 0;
861             _textSize = preamble.textLength;
862             _recordCount = preamble.recordCount;
863         } else if (_format==PLUCKER ) {
864             // TODO
865             return false;
866         }
867 
868 //        if (_mobiExtraDataFlag) {
869 //            // remove extra data
870 //            for ( int k=1; k<_recordCount; k++ )
871 //                _records[k+1].size -= 6;
872 //        }
873 
874 //#ifdef DUMP_PDB_CONTENTS
875 //        int unpoffset2 = 0;
876 //        FILE * out = fopen("/tmp/pdbout.txt", "wb");
877 //        int k;
878 //        for (k=1; k <= _recordCount && unpoffset2 < this->_textSize; k++) {
879 //            LVArray<lUInt8> dst;
880 //            readRecordNoUnpack(k, &_buf);
881 //            if (_mobiExtraDataFlags) {
882 //                removeExtraData(k, _buf);
883 ////                    int b = _buf[_buf.length()-1];
884 ////                    CRLog::trace("Extra data: %d bytes", b);
885 ////                    _records[k].size -= b;
886 ////                    _buf.erase(_buf.length()-1-b, b);
887 //            }
888 //            if (_compression == 2) {
889 //                unpack(dst, _buf);
890 //                _records[k].unpoffset = unpoffset2;
891 //                _records[k].unpsize = dst.length();
892 //                unpoffset2 += dst.length();
893 //                fwrite(dst.get(), dst.length(), 1, out);
894 //                fprintf(out, "\n[block %d end]\n", k);
895 //            }
896 //            CRLog::trace("record[%d] : %06x %06x -  %06x %06x", k, _records[k].offset, _records[k].size, _records[k].unpoffset, _records[k].unpsize);
897 //        }
898 //        fclose(out);
899 //        CRLog::trace("totalUncompSizeHdr=%06x realUncompSize=%06x %d blocks of %d", this->_textSize, unpoffset2, k, _records.length());
900 //#endif
901 
902         if ( !validateContent )
903             return true; // for simple format check
904 
905         LVArray<lUInt8> buf;
906         lUInt32 unpoffset = 0;
907         _crc = 0;
908         for ( int k=0; k<_recordCount; k++ ) {
909 
910             readRecord(k+1, &buf);
911             _records[k+1].unpoffset = unpoffset;
912             _records[k+1].unpsize = buf.length();
913             unpoffset += buf.length();
914             _crc = lStr_crc32( _crc, buf.get(), buf.length() );
915         }
916         _mobiExtraDataFlags = 0;
917 
918 
919         detectFormat( contentFormat );
920 
921 
922 
923         #ifdef DUMP_PDB_CONTENTS
924         {
925                 int unpoffset2 = 0;
926                 FILE * out = fopen("/tmp/pdbout.txt", "wb");
927                 int k;
928                 for (k=1; k <= _recordCount && unpoffset2 < this->_textSize; k++) {
929                     LVArray<lUInt8> dst;
930                     readRecordNoUnpack(k, &_buf);
931 //                    if (_mobiExtraDataFlags) {
932 //                        removeExtraData(k, _buf);
933         //                    int b = _buf[_buf.length()-1];
934         //                    CRLog::trace("Extra data: %d bytes", b);
935         //                    _records[k].size -= b;
936         //                    _buf.erase(_buf.length()-1-b, b);
937 //                    }
938                     if (_compression == 2) {
939                         unpack(dst, _buf);
940                         _records[k].unpoffset = unpoffset2;
941                         _records[k].unpsize = dst.length();
942                         unpoffset2 += dst.length();
943                         fwrite(dst.get(), dst.length(), 1, out);
944                         fprintf(out, "\n[block %d end]\n", k);
945                     }
946                     CRLog::trace("record[%d] : %06x %06x -  %06x %06x", k, _records[k].offset, _records[k].size, _records[k].unpoffset, _records[k].unpsize);
947                 }
948                 fclose(out);
949                 CRLog::trace("totalUncompSizeHdr=%06x realUncompSize=%06x %d blocks of %d", this->_textSize, unpoffset2, k, _records.length());
950         }
951         #endif
952 
953         if (_textSize == (lUInt32)-1)
954             _textSize = unpoffset;
955         else if (unpoffset < _textSize) {
956             CRLog::warn("PDB: Unpacked text size is %d but expected %d", unpoffset, _textSize);
957             _textSize = unpoffset;
958             //return false; // text size does not match
959         }
960 
961 
962         _bufIndex = -1;
963         _bufSize = 0;
964         _bufOffset = 0;
965 
966         SetName(_stream->GetName());
967         m_mode = LVOM_READ;
968 
969 
970         return true;
971     }
972 
973     /// Seek (change file pos)
974     /**
        \param offset is file offset (bytes) relative to origin
976         \param origin is offset base
977         \param pNewPos points to place to store new file position
978         \return lverror_t status: LVERR_OK if success
979     */
Seek(lvoffset_t offset,lvseek_origin_t origin,lvpos_t * pNewPos)980     virtual lverror_t Seek( lvoffset_t offset, lvseek_origin_t origin, lvpos_t * pNewPos ) {
981         lvpos_t npos = 0;
982         lvpos_t currpos = _pos;
983         switch (origin) {
984         case LVSEEK_SET:
985             npos = offset;
986             break;
987         case LVSEEK_CUR:
988             npos = currpos + offset;
989             break;
990         case LVSEEK_END:
991             npos = _textSize + offset;
992             break;
993         }
994         if (npos > _textSize)
995             return LVERR_FAIL;
996         if (!seek(npos) )
997             return LVERR_FAIL;
998         if (pNewPos)
999             *pNewPos =  _pos;
1000         return LVERR_OK;
1001     }
1002 
1003     /// Get file position
1004     /**
1005         \return lvpos_t file position
1006     */
GetPos()1007     virtual lvpos_t GetPos()
1008     {
1009         return _pos;
1010     }
1011 
1012     /// Get file size
1013     /**
1014         \return lvsize_t file size
1015     */
GetSize()1016     virtual lvsize_t  GetSize()
1017     {
1018         return _textSize;
1019     }
1020 
GetSize(lvsize_t * pSize)1021     virtual lverror_t GetSize( lvsize_t * pSize )
1022     {
1023         *pSize = _textSize;
1024         return LVERR_OK;
1025     }
1026 
1027     /// Set file size
1028     /**
1029         \param size is new file size
1030         \return lverror_t status: LVERR_OK if success
1031     */
SetSize(lvsize_t size)1032     virtual lverror_t SetSize( lvsize_t size ) {
1033         CR_UNUSED(size);
1034         return LVERR_NOTIMPL;
1035     }
1036 
1037     /// Read
1038     /**
1039         \param buf is buffer to place bytes read from stream
1040         \param count is number of bytes to read from stream
1041         \param nBytesRead is place to store real number of bytes read from stream
1042         \return lverror_t status: LVERR_OK if success
1043     */
Read(void * buf,lvsize_t count,lvsize_t * nBytesRead)1044     virtual lverror_t Read( void * buf, lvsize_t count, lvsize_t * nBytesRead ) {
1045         lvsize_t bytesRead = 0;
1046         if ( nBytesRead )
1047             *nBytesRead = bytesRead;
1048         lUInt8 * dst = (lUInt8 *)buf;
1049         while ( count > 0 ) {
1050             if ( ! seek(_pos) ) {
1051                 if ( _pos>=_textSize )
1052                     break;
1053                 return LVERR_FAIL;
1054             }
1055             int bytesLeft = (int)(_bufOffset + _bufSize - _pos);
1056             if ( bytesLeft<=0 )
1057                 break;
1058             int sz = count;
1059             if ( sz>bytesLeft )
1060                 sz = bytesLeft;
1061             for ( int i=0; i<sz; i++ )
1062                 dst[i] = _buf[_pos - _bufOffset + i];
1063             _pos += sz;
1064             dst += sz;
1065             count -= sz;
1066             bytesRead += sz;
1067         }
1068         if ( nBytesRead )
1069             *nBytesRead = bytesRead;
1070         return LVERR_OK;
1071     }
1072 
1073     /// Write
1074     /**
1075         \param buf is data to write to stream
1076         \param count is number of bytes to write
1077         \param nBytesWritten is place to store real number of bytes written to stream
1078         \return lverror_t status: LVERR_OK if success
1079     */
Write(const void * buf,lvsize_t count,lvsize_t * nBytesWritten)1080     virtual lverror_t Write( const void * buf, lvsize_t count, lvsize_t * nBytesWritten ) {
1081         CR_UNUSED3(buf, count, nBytesWritten);
1082         return LVERR_NOTIMPL;
1083     }
1084 
1085     /// Check whether end of file is reached
1086     /**
1087         \return true if end of file reached
1088     */
Eof()1089     virtual bool Eof() {
1090         return _pos>=_textSize;
1091     }
1092 
getFormat()1093     Format getFormat() { return _format; }
1094 
1095     /// Constructor
PDBFile()1096     PDBFile() {
1097         //_container.AddRef();
1098         _bufIndex = -1;
1099         _mobiExtraDataFlags = 0;
1100         m_doc_props = LVCreatePropsContainer();
1101     }
1102 
1103     /// Destructor
~PDBFile()1104     virtual ~PDBFile() { }
1105 
1106 };
1107 
1108 // open PDB stream from stream
1109 //LVStreamRef LVOpenPDBStream( LVStreamRef srcstream, int &format )
1110 //{
1111 //    PDBFile * stream = PDBFile::create( srcstream, format );
1112 //    srcstream->SetPos(0);
1113 //    if ( stream!=NULL )
1114 //    {
1115 //        return LVStreamRef( stream );
1116 //    }
1117 //    return LVStreamRef();
1118 //}
1119 
DetectPDBFormat(LVStreamRef stream,doc_format_t & contentFormat)1120 bool DetectPDBFormat( LVStreamRef stream, doc_format_t & contentFormat )
1121 {
1122     PDBFile pdb;
1123     if ( !pdb.open(stream, NULL, false, contentFormat) )
1124         return false;
1125     return true;
1126 }
1127 
isCorrectUtf8Text(LVStreamRef & stream)1128 bool isCorrectUtf8Text(LVStreamRef & stream) {
1129     char enc_name[32];
1130     char lang_name[32];
1131     lvpos_t oldpos = stream->GetPos();
1132     unsigned sz = 16384;
1133     stream->SetPos( 0 );
1134     if ( sz>stream->GetSize() )
1135         sz = stream->GetSize();
1136     if (sz < 8)
1137         return false;
1138     unsigned char * buf = new unsigned char[ sz ];
1139     lvsize_t bytesRead = 0;
1140     if ( stream->Read( buf, sz, &bytesRead )!=LVERR_OK ) {
1141         delete[] buf;
1142         stream->SetPos( oldpos );
1143         return false;
1144     }
1145 
1146     int res = 0;
1147     res = AutodetectCodePageUtf(buf, sz, enc_name, lang_name);
1148     delete[] buf;
1149     return res != 0;
1150 }
1151 
GetPDBCoverpage(LVStreamRef stream)1152 LVStreamRef GetPDBCoverpage(LVStreamRef stream)
1153 {
1154     doc_format_t contentFormat = doc_format_none;
1155     PDBFile * pdb = new PDBFile();
1156     LVPDBContainer * container = new LVPDBContainer();
1157     if (!pdb->open(stream, container, false, contentFormat)) {
1158         delete container;
1159         delete pdb;
1160         return LVStreamRef();
1161     }
1162     stream = LVStreamRef(pdb);
1163     LVContainerRef cnt(container);
1164     container->setStream(stream);
1165     LVStreamRef coverStream;
1166     lString32 coverName = pdb->getDocProps()->getStringDef(DOC_PROP_COVER_FILE);
1167     if (!coverName.empty()) {
1168         coverStream = cnt->OpenStream(coverName.c_str(), LVOM_READ);
1169     }
1170     if (!coverStream.isNull()) {
1171         CRLog::trace("Found PDB coverpage image");
1172         return LVCreateMemoryStream(coverStream);
1173     }
1174     return LVStreamRef();
1175  }
1176 
ImportPDBDocument(LVStreamRef & stream,ldomDocument * doc,LVDocViewCallback * progressCallback,CacheLoadingCallback * formatCallback,doc_format_t & contentFormat)1177 bool ImportPDBDocument( LVStreamRef & stream, ldomDocument * doc, LVDocViewCallback * progressCallback, CacheLoadingCallback * formatCallback, doc_format_t & contentFormat )
1178 {
1179     contentFormat = doc_format_none;
1180     PDBFile * pdb = new PDBFile();
1181     LVPDBContainer * container = new LVPDBContainer();
1182     if ( !pdb->open(stream, container, true, contentFormat) ) {
1183         delete container;
1184         delete pdb;
1185         return false;
1186     }
1187     pdb->getDocProps()->set(doc->getProps());
1188     stream = LVStreamRef(pdb);
1189     container->setStream(stream);
1190     doc->setContainer(LVContainerRef(container));
1191 
1192 #if BUILD_LITE!=1
1193     if ( doc->openFromCache(formatCallback) ) {
1194         if ( progressCallback ) {
1195             progressCallback->OnLoadFileEnd( );
1196         }
1197         return true;
1198     }
1199 #endif
1200     doc->getProps()->set(pdb->getDocProps());
1201 
1202     switch ( contentFormat ) {
1203     case doc_format_html:
1204         // HTML
1205         {
1206 
1207             ldomDocumentWriterFilter writerFilter(doc, false,
1208                     HTML_AUTOCLOSE_TABLE);
1209             LVHTMLParser parser(stream, &writerFilter);
1210             parser.setProgressCallback(progressCallback);
1211             if ( !parser.CheckFormat() ) {
1212                 return false;
1213             } else {
1214                 if (pdb->getFormat()==PDBFile::MOBI && isCorrectUtf8Text(stream))
1215                     parser.SetCharset(U"utf-8");
1216                 if (!parser.Parse()) {
1217                     return false;
1218                 }
1219             }
1220         }
1221         break;
1222     default:
1223     //case doc_format_txt:
1224         // TXT
1225         {
1226             ldomDocumentWriter writer(doc);
1227             LVTextParser parser(stream, &writer, false);
1228             parser.setProgressCallback(progressCallback);
1229             if ( !parser.CheckFormat() ) {
1230                 return false;
1231             } else {
1232                 if (!parser.Parse()) {
1233                     return false;
1234                 }
1235             }
1236         }
1237         break;
1238         // PML
1239         {
1240 //            ldomDocumentWriterFilter writerFilter(*doc, false,
1241 //                    HTML_AUTOCLOSE_TABLE);
1242 
1243 //            LVHTMLParser parser(m_stream, &writerFilter);
1244 //            parser->setProgressCallback(progressCallback);
1245 //            if ( !parser->CheckFormat() ) {
1246 //                return false;
1247 //            } else {
1248 //                if (!parser->Parse()) {
1249 //                    return false;
1250 //                }
1251 //            }
1252         }
1253         break;
1254     }
1255 #ifdef DUMP_PDB_CONTENTS
1256     for (int i=0; i<container->GetObjectCount(); i++) {
1257         const LVContainerItemInfo * item = container->GetObjectInfo(i);
1258         if (item->IsContainer())
1259             continue;
1260         lString32 fn = item->GetName();
1261         if (fn.empty())
1262             fn = cs32("pdb_item_") + lString32::itoa(i);
1263         fn = cs32("/tmp/") + fn;
1264         LVStreamRef in = container->OpenStream(item->GetName(), LVOM_READ);
1265         if (in.isNull())
1266             continue;
1267         LVStreamRef out = LVOpenFileStream(fn.c_str(), LVOM_WRITE);
1268         if (out.isNull())
1269             continue;
1270         CRLog::trace("Dumping stream %s (%d)", LCSTR(fn), (int)item->GetSize());
1271         LVPumpStream(out.get(), in.get());
1272     }
1273     {
1274         LVStreamRef out = LVOpenFileStream("/tmp/pdb_main.txt", LVOM_WRITE);
1275         if (!out.isNull()) {
1276             stream->SetPos(0);
1277             CRLog::trace("Dumping stream /tmp/pdb_main.txt (%d)", (int)stream->GetSize());
1278             LVPumpStream(out.get(), stream.get());
1279             stream->SetPos(0);
1280         }
1281     }
1282 #endif
1283 
1284     return true;
1285 }
1286