1 #include "../include/crsetup.h"
2 #include "../include/lvstream.h"
3 //#define CHM_SUPPORT_ENABLED 1
4 #if CHM_SUPPORT_ENABLED==1
5 #include "../include/chmfmt.h"
6 #include "../include/crlog.h"
7 #include <chm_lib.h>
8 
9 #define DUMP_CHM_DOC 0
10 
11 struct crChmExternalFileStream : public chmExternalFileStream {
12     /** returns file size, in bytes, if opened successfully */
13     //LONGUINT64 (open)( chmExternalFileStream * instance );
14     /** reads bytes to buffer */
15     //LONGINT64 (read)( chmExternalFileStream * instance, unsigned char * buf, LONGUINT64 pos, LONGINT64 len );
16     /** closes file */
17     //int (close)( chmExternalFileStream * instance );
18     LVStreamRef stream;
cr_opencrChmExternalFileStream19     static LONGUINT64 cr_open( chmExternalFileStream * instance )
20     {
21         return (LONGINT64)((crChmExternalFileStream*)instance)->stream->GetSize();
22     }
23     /** reads bytes to buffer */
cr_readcrChmExternalFileStream24     static LONGINT64 cr_read( chmExternalFileStream * instance, unsigned char * buf, LONGUINT64 pos, LONGINT64 len )
25     {
26         lvsize_t bytesRead = 0;
27         if ( ((crChmExternalFileStream*)instance)->stream->SetPos( (lvpos_t)pos )!= pos )
28             return 0;
29         if ( ((crChmExternalFileStream*)instance)->stream->Read( buf, (lvsize_t)len, &bytesRead ) != LVERR_OK )
30             return false;
31         return bytesRead;
32     }
33     /** closes file */
cr_closecrChmExternalFileStream34     static int cr_close( chmExternalFileStream * instance )
35     {
36         ((crChmExternalFileStream*)instance)->stream.Clear();
37 		return 0;
38     }
crChmExternalFileStreamcrChmExternalFileStream39     crChmExternalFileStream( LVStreamRef s )
40     : stream(s)
41     {
42         open = cr_open;
43         read = cr_read;
44         close = cr_close;
45     }
46 };
47 
48 class LVCHMStream : public LVNamedStream
49 {
50 protected:
51     chmFile* _file;
52     chmUnitInfo m_ui;
53     lvpos_t m_pos;
54     lvpos_t m_size;
55 public:
LVCHMStream(chmFile * file)56     LVCHMStream( chmFile* file )
57             : _file(file), m_pos(0), m_size(0)
58     {
59     }
open(const char * name)60     bool open( const char * name )
61     {
62         memset(&m_ui, 0, sizeof(m_ui));
63         if ( CHM_RESOLVE_SUCCESS==chm_resolve_object(_file, name, &m_ui ) ) {
64             m_size = (lvpos_t)m_ui.length;
65             return true;
66         }
67         return false;
68     }
69 
Seek(lvoffset_t offset,lvseek_origin_t origin,lvpos_t * pNewPos)70     virtual lverror_t Seek( lvoffset_t offset, lvseek_origin_t origin, lvpos_t * pNewPos )
71     {
72         //
73         lvpos_t newpos = m_pos;
74         switch ( origin )
75         {
76         case LVSEEK_SET:
77             newpos = offset;
78             break;
79         case LVSEEK_CUR:
80             newpos += offset;
81             break;
82         case LVSEEK_END:
83             newpos = m_size + offset;
84             break;
85         }
86         if ( newpos>m_size )
87             return LVERR_FAIL;
88         if ( pNewPos!=NULL )
89             *pNewPos = newpos;
90         m_pos = newpos;
91         return LVERR_OK;
92     }
93 
94     /// Tell current file position
95     /**
96         \param pNewPos points to place to store file position
97         \return lverror_t status: LVERR_OK if success
98     */
Tell(lvpos_t * pPos)99     virtual lverror_t Tell( lvpos_t * pPos )
100     {
101         *pPos = m_pos;
102         return LVERR_OK;
103     }
104 
SetPos(lvpos_t p)105     virtual lvpos_t SetPos(lvpos_t p)
106     {
107         if ( p<=m_size ) {
108             m_pos = p;
109             return m_pos;
110         }
111         return (lvpos_t)(~0);
112     }
113 
114     /// Get file position
115     /**
116         \return lvpos_t file position
117     */
GetPos()118     virtual lvpos_t   GetPos()
119     {
120         return m_pos;
121     }
122 
123     /// Get file size
124     /**
125         \return lvsize_t file size
126     */
GetSize()127     virtual lvsize_t  GetSize()
128     {
129         return m_size;
130     }
131 
GetSize(lvsize_t * pSize)132     virtual lverror_t GetSize( lvsize_t * pSize )
133     {
134         *pSize = m_size;
135         return LVERR_OK;
136     }
137 
Read(void * buf,lvsize_t count,lvsize_t * nBytesRead)138     virtual lverror_t Read( void * buf, lvsize_t count, lvsize_t * nBytesRead )
139     {
140         int cnt = (int)count;
141         if ( m_pos + cnt > m_size )
142             cnt = (int)(m_size - m_pos);
143         if ( cnt <= 0 )
144             return LVERR_FAIL;
145         LONGINT64 gotBytes = chm_retrieve_object(_file, &m_ui, (unsigned char *)buf, m_pos, cnt );
146         m_pos += (lvpos_t)gotBytes;
147         if (nBytesRead)
148             *nBytesRead = (lvsize_t)gotBytes;
149         return LVERR_OK;
150     }
151 
152 
Write(const void *,lvsize_t,lvsize_t *)153     virtual lverror_t Write( const void * /*buf*/, lvsize_t /*count*/, lvsize_t * /*nBytesWritten*/ )
154     {
155         return LVERR_FAIL;
156     }
157 
Eof()158     virtual bool Eof()
159     {
160         return (m_pos >= m_size);
161     }
162 
SetSize(lvsize_t size)163     virtual lverror_t SetSize( lvsize_t size )
164     {
165         CR_UNUSED(size);
166         // support only size grow
167         return LVERR_FAIL;
168     }
169 
170 
171 };
172 
173 class LVCHMContainer : public LVNamedContainer
174 {
175 protected:
176     //LVDirectoryContainer * m_parent;
177     crChmExternalFileStream _stream;
178     chmFile* _file;
179 public:
OpenStream(const lChar32 * fname,lvopen_mode_t mode)180     virtual LVStreamRef OpenStream( const lChar32 * fname, lvopen_mode_t mode )
181     {
182         LVStreamRef stream;
183         if ( mode!=LVOM_READ )
184             return stream;
185 
186         LVCHMStream * p = new LVCHMStream(_file);
187         lString32 fn(fname);
188         if ( fn[0]!='/' )
189             fn = cs32("/") + fn;
190         if ( !p->open( UnicodeToUtf8(lString32(fn)).c_str() )) {
191             delete p;
192             return stream;
193         }
194         stream = p;
195         stream->SetName( fname );
196         return stream;
197     }
GetParentContainer()198     virtual LVContainer * GetParentContainer()
199     {
200         return NULL;
201     }
GetObjectInfo(int index)202     virtual const LVContainerItemInfo * GetObjectInfo(int index)
203     {
204         if (index>=0 && index<m_list.length())
205             return m_list[index];
206         return NULL;
207     }
GetObjectCount() const208     virtual int GetObjectCount() const
209     {
210         return m_list.length();
211     }
GetSize(lvsize_t * pSize)212     virtual lverror_t GetSize( lvsize_t * pSize )
213     {
214         if (m_fname.empty())
215             return LVERR_FAIL;
216         *pSize = GetObjectCount();
217         return LVERR_OK;
218     }
LVCHMContainer(LVStreamRef s)219     LVCHMContainer(LVStreamRef s) : _stream(s), _file(NULL)
220     {
221     }
~LVCHMContainer()222     virtual ~LVCHMContainer()
223     {
224         SetName(NULL);
225         Clear();
226         if ( _file )
227             chm_close( _file );
228     }
229 
addFileItem(const char * filename,LONGUINT64 len)230     void addFileItem( const char * filename, LONGUINT64 len )
231     {
232         LVCommonContainerItemInfo * item = new LVCommonContainerItemInfo();
233         item->SetItemInfo( lString32(filename), (lvsize_t)len, 0, false );
234         //CRLog::trace("CHM file item: %s [%d]", filename, (int)len);
235         Add(item);
236     }
237 
CHM_ENUMERATOR_CALLBACK(struct chmFile *,struct chmUnitInfo * ui,void * context)238     static int CHM_ENUMERATOR_CALLBACK (struct chmFile * /*h*/,
239                               struct chmUnitInfo *ui,
240                               void *context)
241     {
242         LVCHMContainer * c = (LVCHMContainer*)context;
243         if ( (ui->flags & CHM_ENUMERATE_FILES) && (ui->flags & CHM_ENUMERATE_NORMAL) ) {
244             c->addFileItem( ui->path, ui->length );
245         }
246         return CHM_ENUMERATOR_CONTINUE;
247     }
248 
open()249     bool open()
250     {
251         _file = chm_open( &_stream );
252         if ( !_file )
253             return false;
254         chm_enumerate( _file,
255                   CHM_ENUMERATE_ALL,
256                   CHM_ENUMERATOR_CALLBACK,
257                   this);
258         return true;
259     }
260 };
261 
262 /// opens CHM container
LVOpenCHMContainer(LVStreamRef stream)263 LVContainerRef LVOpenCHMContainer( LVStreamRef stream )
264 {
265     LVCHMContainer * chm = new LVCHMContainer(stream);
266     if ( !chm->open() ) {
267         delete chm;
268         return LVContainerRef();
269     }
270     chm->SetName( stream->GetName() );
271     return LVContainerRef( chm );
272 }
273 
DetectCHMFormat(LVStreamRef stream)274 bool DetectCHMFormat( LVStreamRef stream )
275 {
276     stream->SetPos(0);
277     LVContainerRef cont = LVOpenCHMContainer( stream );
278     if ( !cont.isNull() ) {
279         return true;
280     }
281     return false;
282 }
283 
284 class CHMBinaryReader {
285     LVStreamRef _stream;
286 public:
CHMBinaryReader(LVStreamRef stream)287     CHMBinaryReader( LVStreamRef stream ) : _stream(stream) {
288     }
setPos(int offset)289     bool setPos( int offset ) {
290         return (int)_stream->SetPos(offset) == offset;
291     }
eof()292     bool eof() {
293         return _stream->Eof();
294     }
295 
readInt32(bool & error)296     lUInt32 readInt32( bool & error ) {
297         int b1 = _stream->ReadByte();
298         int b2 = _stream->ReadByte();
299         int b3 = _stream->ReadByte();
300         int b4 = _stream->ReadByte();
301         if ( b1==-1 || b2==-1  || b3==-1  || b4==-1 ) {
302             error = true;
303             return 0;
304         }
305         return (lUInt32)(b1 | (b2<<8) | (b3<<16) | (b4<<24));
306     }
readInt16(bool & error)307     lUInt16 readInt16( bool & error ) {
308         int b1 = _stream->ReadByte();
309         int b2 = _stream->ReadByte();
310         if ( b1==-1 || b2==-1 ) {
311             error = true;
312             return 0;
313         }
314         return (lUInt16)(b1 | (b2<<8));
315     }
readInt8(bool & error)316     lUInt8 readInt8( bool & error ) {
317         int b = _stream->ReadByte();
318         if ( b==-1 ) {
319             error = true;
320             return 0;
321         }
322         return (lUInt8)(b & 0xFF);
323     }
bytesLeft()324     int bytesLeft() {
325         return (int)(_stream->GetSize() - _stream->GetPos());
326     }
327 
readBytes(LVArray<lUInt8> & bytes,int offset,int length)328     bool readBytes( LVArray<lUInt8> & bytes, int offset, int length ) {
329         bytes.clear();
330         bytes.reserve(length);
331         if ( offset>=0 )
332             if ((int)_stream->SetPos(offset) != offset)
333                 return false;
334         for (int i = 0; i < length; i++) {
335             int b = _stream->ReadByte();
336             if ( b==-1 )
337                 return false;
338             bytes[i] = (lUInt8)b;
339         }
340         return true;
341     }
342 
343     // offset==-1 to avoid changing position, length==-1 to use 0-terminated
readString(int offset,int length)344     lString8 readString( int offset, int length ) {
345         if ( length==0 )
346             return lString8::empty_str;
347         if ( offset>=0 )
348             if ((int)_stream->SetPos(offset) != offset)
349                 return lString8::empty_str;
350         lString8 res;
351         if ( length>0 )
352             res.reserve(length);
353         bool zfound = false;
354         for ( int i=0; i<length || length==-1; i++ ) {
355             int b = _stream->ReadByte();
356             if (zfound || (b==0 && length>=0)) {
357                 zfound = true;
358                 continue;
359             }
360             if ( b==-1 || b==0 )
361                 break;
362             res.append(1, (lUInt8)b);
363         }
364         return res;
365     }
366     // offset==-1 to avoid changing position, length==-1 to use 0-terminated
readStringUtf16(int offset,int length)367     lString32 readStringUtf16( int offset, int length ) {
368         if ( length==0 )
369             return lString32::empty_str;
370         if ( offset>=0 )
371             if ((int)_stream->SetPos(offset) != offset)
372                 return lString32::empty_str;
373         lString32 res;
374         if ( length>0 )
375             res.reserve(length);
376         for ( int i=0; i<length || length==-1; i++ ) {
377             int b1 = _stream->ReadByte();
378             if ( b1==-1 || b1==0 )
379                 break;
380             int b2 = _stream->ReadByte();
381             if ( b2==-1 || b2==0 )
382                 break;
383             res.append(1, (lChar32)(b1 | (b2<<16)));
384         }
385         return res;
386     }
readEncInt()387     lInt64 readEncInt() {
388         lInt64 res = 0;
389         int shift = 0;
390         int b = 0;
391         do {
392             b = _stream->ReadByte();
393             if ( b==-1 )
394                 return 0;
395             res |= ( ((lInt64)(b&0x7F)) << shift );
396             shift+=7;
397         } while ( b&0x80 );
398         return res;
399     }
400 };
401 
402 class CHMUrlStrEntry {
403 public:
404     lUInt32 offset;
405     lString8 url;
406 };
407 
408 const int URLSTR_BLOCK_SIZE = 0x1000;
409 class CHMUrlStr {
410     LVContainerRef _container;
411     CHMBinaryReader _reader;
412     LVPtrVector<CHMUrlStrEntry> _table;
413 
CHMUrlStr(LVContainerRef container,LVStreamRef stream)414     CHMUrlStr( LVContainerRef container, LVStreamRef stream ) : _container(container), _reader(stream)
415     {
416 
417     }
readInt32(const lUInt8 * & data)418     lUInt32 readInt32( const lUInt8 * & data ) {
419         lUInt32 res = 0;
420         res = *(data++);
421         res = res | (((lUInt32)(*(data++))) << 8);
422         res = res | (((lUInt32)(*(data++))) << 16);
423         res = res | (((lUInt32)(*(data++))) << 24);
424         return res;
425     }
readString(const lUInt8 * & data,int maxlen)426     lString8 readString( const lUInt8 * & data, int maxlen ) {
427         lString8 res;
428         for ( int i=0; i<maxlen; i++ ) {
429             lUInt8 b = *data++;
430             if ( b==0 )
431                 break;
432             res.append(1, b);
433         }
434         return res;
435     }
436 
437 
decodeBlock(const lUInt8 * ptr,lUInt32 blockOffset,int size)438     bool decodeBlock( const lUInt8 * ptr, lUInt32 blockOffset, int size ) {
439         const lUInt8 * data = ptr;
440         const lUInt8 * maxdata = ptr + size;
441         while ( data + 8 < maxdata ) {
442             lUInt32 offset = (lUInt32)(blockOffset + (data - ptr));
443             //lUInt32 urlOffset =
444             readInt32(data);
445             //lUInt32 frameOffset =
446             readInt32(data);
447             if ( data < maxdata ) { //urlOffset > offset ) {
448                 CHMUrlStrEntry * item = new CHMUrlStrEntry();
449                 item->offset = offset;
450                 item->url = readString(data, (int)(maxdata - data));
451                 //CRLog::trace("urlstr[offs=%x, url=%s]", item->offset, item->url.c_str());
452                 _table.add( item );
453             }
454         }
455         return true;
456     }
457 
read()458     bool read() {
459         bool err = false;
460         LVArray<lUInt8> bytes;
461         _reader.readInt8(err);
462         lUInt32 offset = 1;
463         while ( !_reader.eof() && !err ) {
464             int sz = _reader.bytesLeft();
465             if ( sz>URLSTR_BLOCK_SIZE )
466                 sz = URLSTR_BLOCK_SIZE;
467             err = !_reader.readBytes(bytes, -1, sz) || err;
468             if ( err )
469                 break;
470             err = !decodeBlock( bytes.get(), offset, sz ) || err;
471             offset += sz;
472         }
473         return !err;
474     }
475 public:
open(LVContainerRef container)476     static CHMUrlStr * open( LVContainerRef container ) {
477         LVStreamRef stream = container->OpenStream(U"#URLSTR", LVOM_READ);
478         if ( stream.isNull() )
479             return NULL;
480         CHMUrlStr * res = new CHMUrlStr( container, stream );
481         if ( !res->read() ) {
482             delete res;
483             return NULL;
484         }
485         CRLog::info("CHM URLSTR: %d entries read", res->_table.length());
486         return res;
487     }
findByOffset(lUInt32 offset)488     lString8 findByOffset( lUInt32 offset ) {
489         for ( int i=0; i<_table.length(); i++ ) {
490             if ( _table[i]->offset==offset )
491                 return _table[i]->url;
492         }
493         return lString8::empty_str;
494     }
getUrlList(lString32Collection & urlList)495     void getUrlList( lString32Collection & urlList ) {
496         for ( int i=0; i<_table.length(); i++ ) {
497             lString8 s = _table[i]->url;
498             if ( !s.empty() ) {
499                 urlList.add(Utf8ToUnicode(s));
500             }
501         }
502     }
503 };
504 
505 class CHMUrlTableEntry {
506 public:
507     lUInt32 offset;
508     lUInt32 id;
509     lUInt32 topicsIndex;
510     lUInt32 urlStrOffset;
CHMUrlTableEntry()511     CHMUrlTableEntry()
512     : offset(0)
513     , id(0)
514     , topicsIndex(0)
515     , urlStrOffset(0)
516     {
517 
518     }
519 };
520 
521 const int URLTBL_BLOCK_SIZE = 0x1000;
522 const int URLTBL_BLOCK_RECORD_COUNT = 341;
523 class CHMUrlTable {
524     LVContainerRef _container;
525     CHMBinaryReader _reader;
526     LVPtrVector<CHMUrlTableEntry> _table;
527     CHMUrlStr * _strings;
528 
529 
CHMUrlTable(LVContainerRef container,LVStreamRef stream)530     CHMUrlTable( LVContainerRef container, LVStreamRef stream ) : _container(container), _reader(stream), _strings(NULL)
531     {
532 
533     }
readInt32(const lUInt8 * & data)534     lUInt32 readInt32( const lUInt8 * & data ) {
535         lUInt32 res = 0;
536         res = *(data++);
537         res = res | (((lUInt32)(*(data++))) << 8);
538         res = res | (((lUInt32)(*(data++))) << 16);
539         res = res | (((lUInt32)(*(data++))) << 24);
540         return res;
541     }
542 
decodeBlock(const lUInt8 * data,lUInt32 offset,int size)543     bool decodeBlock( const lUInt8 * data, lUInt32 offset, int size ) {
544         for ( int i=0; i<URLTBL_BLOCK_RECORD_COUNT && size>0; i++ ) {
545             CHMUrlTableEntry * item = new CHMUrlTableEntry();
546             item->offset = offset;
547             item->id = readInt32(data);
548             item->topicsIndex = readInt32(data);
549             item->urlStrOffset = readInt32(data);
550             //CRLog::trace("urltbl[offs=%x, id=%x, ti=%x, urloffs=%x]", item->offset, item->id, item->topicsIndex, item->urlStrOffset);
551             _table.add( item );
552             offset += 4*3;
553             size -= 4*3;
554         }
555         return true;
556     }
557 
read()558     bool read() {
559         bool err = false;
560         LVArray<lUInt8> bytes;
561         lUInt32 offset = 0;
562         while ( !_reader.eof() && !err ) {
563             int sz = _reader.bytesLeft();
564             if ( sz>URLTBL_BLOCK_SIZE )
565                 sz = URLTBL_BLOCK_SIZE;
566             err = !_reader.readBytes(bytes, -1, sz) || err;
567             if ( err )
568                 break;
569             err = !decodeBlock( bytes.get(), offset, sz ) || err;
570             offset += sz;
571         }
572         _strings = CHMUrlStr::open(_container);
573         if ( !_strings ) {
574             CRLog::warn("CHM: cannot read #URLSTR");
575         }
576         return !err;
577     }
578 public:
~CHMUrlTable()579     ~CHMUrlTable() {
580         if ( _strings )
581             delete _strings;
582     }
583 
open(LVContainerRef container)584     static CHMUrlTable * open( LVContainerRef container ) {
585         LVStreamRef stream = container->OpenStream(U"#URLTBL", LVOM_READ);
586         if ( stream.isNull() )
587             return NULL;
588         CHMUrlTable * res = new CHMUrlTable( container, stream );
589         if ( !res->read() ) {
590             delete res;
591             return NULL;
592         }
593         CRLog::info("CHM URLTBL: %d entries read", res->_table.length());
594         return res;
595     }
596 
urlById(lUInt32 id)597     lString8 urlById( lUInt32 id ) {
598         if ( !_strings )
599             return lString8::empty_str;
600         for ( int i=0; i<_table.length(); i++ ) {
601             if ( _table[i]->id==id )
602                 return _strings->findByOffset( _table[i]->urlStrOffset );
603         }
604         return lString8::empty_str;
605     }
606 
findById(lUInt32 id)607     CHMUrlTableEntry * findById( lUInt32 id ) {
608         for ( int i=0; i<_table.length(); i++ ) {
609             if ( _table[i]->id==id )
610                 return _table[i];
611         }
612         return NULL;
613     }
findByOffset(lUInt32 offset)614     CHMUrlTableEntry * findByOffset( lUInt32 offset ) {
615         for ( int i=0; i<_table.length(); i++ ) {
616             if ( _table[i]->offset==offset )
617                 return _table[i];
618         }
619         return NULL;
620     }
621 
getUrlList(lString32Collection & urlList)622     void getUrlList( lString32Collection & urlList ) {
623         if ( !_strings )
624             return;
625         _strings->getUrlList( urlList );
626 //        for ( int i=0; i<_table.length(); i++ ) {
627 //            lString8 s = _strings->findByOffset( _table[i]->urlStrOffset );
628 //            if ( !s.empty() ) {
629 //                urlList.add(Utf8ToUnicode(s));
630 //            }
631 //        }
632     }
633 };
634 
635 class CHMSystem {
636 
637     LVContainerRef _container;
638     CHMBinaryReader _reader;
639     lUInt32 _fileVersion;
640     lString8 _contentsFile;
641     lString8 _indexFile;
642     lString8 _defaultTopic;
643     lString8 _title;
644     lString8 _language;
645     lString8 _defaultFont;
646     lUInt32  _lcid;
647     bool _dbcs;
648     bool _fullTextSearch;
649     bool _hasKLinks;
650     bool _hasALinks;
651     lUInt32 _binaryIndexURLTableId;
652     lUInt32 _binaryTOCURLTableId;
653     const lChar32 * _enc_table;
654     lString32 _enc_name;
655     CHMUrlTable * _urlTable;
656 
CHMSystem(LVContainerRef container,LVStreamRef stream)657     CHMSystem( LVContainerRef container, LVStreamRef stream ) : _container(container), _reader(stream)
658     , _fileVersion(0)
659     , _lcid(0)
660     , _dbcs(false)
661     , _fullTextSearch(false)
662     , _hasKLinks(false)
663     , _hasALinks(false)
664     , _binaryIndexURLTableId(0)
665     , _binaryTOCURLTableId(0)
666     , _enc_table(NULL)
667     , _urlTable(NULL)
668     {
669     }
670 
decodeEntry()671     bool decodeEntry() {
672         bool err = false;
673         int code = _reader.readInt16(err);
674         int length = _reader.readInt16(err);
675         //CRLog::trace("CHM binary item code=%d, length=%d, bytesLeft=%d", code, length, _reader.bytesLeft());
676         if ( err )
677             return false;
678         LVArray<lUInt8> bytes;
679         switch( code ) {
680         case 0:
681             _contentsFile = _reader.readString(-1, length);
682             break;
683         case 1:
684             _indexFile = _reader.readString(-1, length);
685             break;
686         case 2:
687             _defaultTopic = _reader.readString(-1, length);
688             break;
689         case 3:
690             _title = _reader.readString(-1, length);
691             break;
692         case 4:
693             {
694                 _lcid = _reader.readInt32(err);
695                 int codepage = langToCodepage( _lcid );
696                 const lChar32 * enc_name = GetCharsetName( codepage );
697                 const lChar32 * table = GetCharsetByte2UnicodeTable( codepage );
698 		_language = langToLanguage( _lcid );
699                 if ( enc_name!=NULL ) {
700                     _enc_table = table;
701                     _enc_name = lString32(enc_name);
702                     CRLog::info("CHM LCID: %08x, charset=%s", _lcid, LCSTR(_enc_name));
703                 } else {
704                     CRLog::info("CHM LCID: %08x -- cannot find charset encoding table", _lcid);
705                 }
706                 _dbcs = _reader.readInt32(err)==1;
707                 _fullTextSearch = _reader.readInt32(err)==1;
708                 _hasKLinks = _reader.readInt32(err)==1;
709                 _hasALinks = _reader.readInt32(err)==1;
710                 err = !_reader.readBytes(bytes, -1, length - (5*4)) || err;
711             }
712             break;
713         case 7:
714             if ( _fileVersion>2 )
715                 _binaryIndexURLTableId = _reader.readInt32(err);
716             else
717                 err = !_reader.readBytes(bytes, -1, length) || err;
718             break;
719         case 11:
720             if ( _fileVersion>2 )
721                 _binaryTOCURLTableId = _reader.readInt32(err);
722             else
723                 err = !_reader.readBytes(bytes, -1, length) || err;
724             break;
725         case 16:
726             _defaultFont = _reader.readString(-1, length);
727             CRLog::info("CHM default font: %s", _defaultFont.c_str());
728             if ( _enc_table==NULL ) {
729                 for ( int i=_defaultFont.length()-1; i>0; i-- ) {
730                     if ( _defaultFont[i]==',' ) {
731                         int cs = _defaultFont.substr(i+1, _defaultFont.length()-i-1).atoi();
732                         const lChar32 * cpname = NULL;
733                         switch (cs) {
734                         case 0x00: cpname = U"windows-1252"; break;
735                         case 0xCC: cpname = U"windows-1251"; break;
736                         case 0xEE: cpname = U"windows-1250"; break;
737                         case 0xA1: cpname = U"windows-1253"; break;
738                         case 0xA2: cpname = U"windows-1254"; break;
739                         case 0xBA: cpname = U"windows-1257"; break;
740                         case 0xB1: cpname = U"windows-1255"; break;
741                         case 0xB2: cpname = U"windows-1256"; break;
742                         default: break;
743                         }
744                         const lChar32 * table = GetCharsetByte2UnicodeTable( cpname );
745                         if ( cpname!=NULL && table!=NULL ) {
746                             CRLog::info("CHM charset detected from default font: %s", LCSTR(lString32(cpname)));
747                             _enc_table = table;
748                             _enc_name = lString32(cpname);
749                         }
750                         break;
751                     }
752                 }
753             }
754             break;
755         default:
756             err = !_reader.readBytes(bytes, -1, length) || err;
757             break;
758         }
759         return !err;
760     }
761 
read()762     bool read() {
763         bool err = false;
764         _fileVersion = _reader.readInt32(err);
765         int count = 0;
766         while ( !_reader.eof() && !err ) {
767             err = !decodeEntry() || err;
768             if ( !err )
769                 count++;
770         }
771 
772         if ( err ) {
773             CRLog::error("CHM decoding error: %d blocks decoded, stream bytes left=%d", count, _reader.bytesLeft() );
774             return false;
775         }
776         if ( _enc_table==NULL ) {
777             _enc_table = GetCharsetByte2UnicodeTable( 1252 );
778             _enc_name = cs32("windows-1252");
779         }
780         _urlTable = CHMUrlTable::open(_container);
781         return !err;
782     }
783 
784 public:
~CHMSystem()785     ~CHMSystem() {
786         if ( _urlTable!=NULL )
787             delete _urlTable;
788     }
789 
open(LVContainerRef container)790     static CHMSystem * open( LVContainerRef container ) {
791         LVStreamRef stream = container->OpenStream(U"#SYSTEM", LVOM_READ);
792         if ( stream.isNull() )
793             return NULL;
794         CHMSystem * res = new CHMSystem( container, stream );
795         if ( !res->read() ) {
796             delete res;
797             return NULL;
798         }
799         return res;
800     }
801 
decodeString(const lString8 & str)802     lString32 decodeString( const lString8 & str ) {
803         return ByteToUnicode( str, _enc_table );
804     }
805 
getTitle()806     lString32 getTitle() {
807         return decodeString(_title);
808     }
809 
getLanguage()810     lString32 getLanguage() {
811         return decodeString(_language);
812     }
813 
getDefaultTopic()814     lString32 getDefaultTopic() {
815         return decodeString(_defaultTopic);
816     }
817 
getEncodingName()818     lString32 getEncodingName() {
819         return _enc_name;
820     }
821 
getContentsFileName()822     lString32 getContentsFileName() {
823         if ( _binaryTOCURLTableId!=0 ) {
824             lString8 url = _urlTable->urlById(_binaryTOCURLTableId);
825             if ( !url.empty() )
826                 return decodeString(url);
827         }
828         if ( _contentsFile.empty() ) {
829             lString32 hhcName;
830             int bestSize = 0;
831             for ( int i=0; i<_container->GetObjectCount(); i++ ) {
832                 const LVContainerItemInfo * item = _container->GetObjectInfo(i);
833                 if ( !item->IsContainer() ) {
834                     lString32 name = item->GetName();
835                     int sz = item->GetSize();
836                     //CRLog::trace("CHM item: %s", LCSTR(name));
837                     lString32 lname = name;
838                     lname.lowercase();
839                     if ( lname.endsWith(".hhc") ) {
840                         if ( sz > bestSize ) {
841                             hhcName = name;
842                             bestSize = sz;
843                         }
844                     }
845                 }
846             }
847             if ( !hhcName.empty() )
848                 return hhcName;
849         }
850         return decodeString(_contentsFile);
851     }
getUrlList(lString32Collection & urlList)852     void getUrlList( lString32Collection & urlList ) {
853         if ( !_urlTable )
854             return;
855         _urlTable->getUrlList(urlList);
856     }
857 };
858 
LVParseCHMHTMLStream(LVStreamRef stream,lString32 defEncodingName)859 ldomDocument * LVParseCHMHTMLStream( LVStreamRef stream, lString32 defEncodingName )
860 {
861     if ( stream.isNull() )
862         return NULL;
863 
864     // detect encondig
865     stream->SetPos(0);
866 
867 #if 0
868     ldomDocument * encDetectionDoc = LVParseHTMLStream( stream );
869     int encoding = 0;
870     if ( encDetectionDoc!=NULL ) {
871         ldomNode * node = encDetectionDoc->nodeFromXPath(U"/html/body/object[1]");
872         if ( node!=NULL ) {
873             for ( int i=0; i<node->getChildCount(); i++ ) {
874                 ldomNode * child = node->getChildNode(i);
875                 if (child && child->isElement() && child->getNodeName() == "param" && child->getAttributeValue(U"name") == "Font") {
876                     lString32 s = child->getAttributeValue(U"value");
877                     lString32 lastDigits;
878                     for ( int i=s.length()-1; i>=0; i-- ) {
879                         lChar32 ch = s[i];
880                         if ( ch>='0' && ch<='9' )
881                             lastDigits.insert(0, 1, ch);
882                         else
883                             break;
884                     }
885                     encoding = lastDigits.atoi();
886                     CRLog::debug("LVParseCHMHTMLStream: encoding detected: %d", encoding);
887                 }
888             }
889         }
890         delete encDetectionDoc;
891     }
892     const lChar32 * enc = U"cp1252";
893     if ( encoding==1 ) {
894         enc = U"cp1251";
895     }
896 #endif
897 
898     stream->SetPos(0);
899     bool error = true;
900     ldomDocument * doc;
901     doc = new ldomDocument();
902     doc->setDocFlags( 0 );
903 
904     ldomDocumentWriterFilter writerFilter(doc, false, HTML_AUTOCLOSE_TABLE);
905     writerFilter.setFlags(writerFilter.getFlags() | TXTFLG_CONVERT_8BIT_ENTITY_ENCODING);
906 
907     /// FB2 format
908     LVFileFormatParser * parser = new LVHTMLParser(stream, &writerFilter);
909     if ( !defEncodingName.empty() )
910         parser->SetCharset(defEncodingName.c_str());
911     if ( parser->CheckFormat() ) {
912         if ( parser->Parse() ) {
913             error = false;
914         }
915     }
916     delete parser;
917     if ( error ) {
918         delete doc;
919         doc = NULL;
920     }
921     return doc;
922 }
923 
filename_comparator(lString32 & _s1,lString32 & _s2)924 static int filename_comparator(lString32 & _s1, lString32 & _s2) {
925     lString32 s1 = _s1.substr(1);
926     lString32 s2 = _s2.substr(1);
927     if (s1.endsWith(".htm"))
928         s1.erase(s1.length()-4, 4);
929     else if (s1.endsWith(".html"))
930         s1.erase(s1.length()-5, 5);
931     if (s2.endsWith(".htm"))
932         s2.erase(s2.length()-4, 4);
933     else if (s2.endsWith(".html"))
934         s2.erase(s2.length()-5, 5);
935     if (s1 == "index")
936         return -1;
937     else if (s2 == "index")
938         return 1;
939     if (s1 == "header")
940         return -1;
941     else if (s2 == "header")
942         return 1;
943     int d1 = 0;
944     int d2 = 0;
945     s1.atoi(d1);
946     s2.atoi(d2);
947     if (d1 || d2) {
948         if (d1 && d2) {
949             if (d1 < d2)
950                 return -1;
951             else if (d1 > d2)
952                 return 1;
953             return 0;
954         } else if (d1) {
955             return -1;
956         } else {
957             return 1;
958         }
959     }
960     return s1.compare(s2);
961 }
962 
963 class CHMTOCReader {
964     LVContainerRef _cont;
965     ldomDocumentFragmentWriter * _appender;
966     ldomDocument * _doc;
967     LVTocItem * _toc;
968     lString32HashedCollection _fileList;
969     lString32 lastFile;
970     lString32 _defEncodingName;
971     bool _fakeToc;
972 public:
CHMTOCReader(LVContainerRef cont,ldomDocument * doc,ldomDocumentFragmentWriter * appender)973     CHMTOCReader( LVContainerRef cont, ldomDocument * doc, ldomDocumentFragmentWriter * appender )
974         : _cont(cont), _appender(appender), _doc(doc), _fileList(1024)
975     {
976         _toc = _doc->getToc();
977     }
addFile(const lString32 & v1)978     void addFile( const lString32 & v1 ) {
979         int index = _fileList.find(v1.c_str());
980         if ( index>=0 )
981             return; // already added
982         _fileList.add(v1.c_str());
983         CRLog::trace("New source file: %s", LCSTR(v1) );
984         _appender->addPathSubstitution( v1, cs32("_doc_fragment_") + fmt::decimal(_fileList.length()) );
985         _appender->setCodeBase( v1 );
986     }
987 
addTocItem(lString32 name,lString32 url,int level)988     void addTocItem( lString32 name, lString32 url, int level )
989     {
990         //CRLog::trace("CHM toc level %d: '%s' : %s", level, LCSTR(name), LCSTR(url) );
991         if (url.startsWith(".."))
992             url = LVExtractFilename( url );
993         lString32 v1, v2;
994         if ( !url.split2(cs32("#"), v1, v2) )
995             v1 = url;
996         PreProcessXmlString( name, 0 );
997         addFile(v1);
998         lString32 url2 = _appender->convertHref(url);
999         //CRLog::trace("new url: %s", LCSTR(url2) );
1000         while ( _toc->getLevel()>level && _toc->getParent() )
1001             _toc = _toc->getParent();
1002         _toc = _toc->addChild(name, ldomXPointer(), url2);
1003     }
1004 
recurseToc(ldomNode * node,int level)1005     void recurseToc( ldomNode * node, int level )
1006     {
1007         lString32 nodeName = node->getNodeName();
1008         lUInt16 paramElemId = node->getDocument()->getElementNameIndex(U"param");
1009         if (nodeName == "object") {
1010             if ( level>0 ) {
1011                 // process object
1012                 if (node->getAttributeValue("type") == "text/sitemap") {
1013                     lString32 name, local;
1014                     int cnt = node->getChildCount();
1015                     for ( int i=0; i<cnt; i++ ) {
1016                         ldomNode * child = node->getChildElementNode(i, paramElemId);
1017                         if ( child ) {
1018                             lString32 paramName = child->getAttributeValue("name");
1019                             lString32 paramValue = child->getAttributeValue("value");
1020                             if (paramName == "Name")
1021                                 name = paramValue;
1022                             else if (paramName == "Local")
1023                                 local = paramValue;
1024                         }
1025                     }
1026                     if ( !local.empty() && !name.empty() ) {
1027                         // found!
1028                         addTocItem( name, local, level );
1029                     }
1030                 }
1031             }
1032             return;
1033         }
1034         if (nodeName == "ul")
1035             level++;
1036         int cnt = node->getChildCount();
1037         for ( int i=0; i<cnt; i++ ) {
1038             ldomNode * child = node->getChildElementNode(i);
1039             if ( child ) {
1040                 recurseToc( child, level );
1041             }
1042         }
1043     }
1044 
init(LVContainerRef cont,lString32 hhcName,lString32 defEncodingName,lString32Collection & urlList,lString32 mainPageName)1045     bool init( LVContainerRef cont, lString32 hhcName, lString32 defEncodingName, lString32Collection & urlList, lString32 mainPageName )
1046     {
1047         if ( hhcName.empty() && urlList.length()==0 ) {
1048             lString32Collection htms;
1049             for (int i=0; i<cont->GetObjectCount(); i++) {
1050                 const LVContainerItemInfo * item = cont->GetObjectInfo(i);
1051                 if (item->IsContainer())
1052                     continue;
1053                 lString32 name = item->GetName();
1054                 if (name == "/bookindex.htm" || name == "/headerindex.htm")
1055                     continue;
1056                 //CRLog::trace("item %d : %s", i, LCSTR(name));
1057                 if (name.endsWith(".htm") || name.endsWith(".html"))
1058                     htms.add(name);
1059             }
1060             if (!htms.length())
1061                 return false;
1062 //            {
1063 //                for (int j=0; j<htms.length(); j++) {
1064 //                    CRLog::trace("unsorted %d : %s", j, LCSTR(htms[j]));
1065 //                }
1066 //            }
1067             htms.sort(filename_comparator);
1068 //            {
1069 //                for (int j=0; j<htms.length(); j++) {
1070 //                    CRLog::trace("sorted %d : %s", j, LCSTR(htms[j]));
1071 //                }
1072 //            }
1073             urlList.addAll(htms);
1074         }
1075         _defEncodingName = defEncodingName;
1076 
1077         if ( !mainPageName.empty() )
1078             addFile(mainPageName);
1079 
1080         if ( hhcName.empty() ) {
1081             _fakeToc = true;
1082             for ( int i=0; i<urlList.length(); i++ ) {
1083                 //lString32 name = lString32::itoa(i+1);
1084                 lString32 name = urlList[i];
1085                 if ( name.endsWith(".htm") )
1086                     name = name.substr(0, name.length()-4);
1087                 else if ( name.endsWith(".html") )
1088                     name = name.substr(0, name.length()-5);
1089                 if (name.startsWith("/"))
1090                     name = name.substr(1);
1091                 addTocItem( name, urlList[i], 0 );
1092             }
1093             return true;
1094         } else {
1095             _fakeToc = false;
1096             LVStreamRef tocStream = cont->OpenStream(hhcName.c_str(), LVOM_READ);
1097             if ( tocStream.isNull() ) {
1098                 CRLog::error("CHM: Cannot open .hhc");
1099                 return false;
1100             }
1101             ldomDocument * doc = LVParseCHMHTMLStream( tocStream, defEncodingName );
1102             if ( !doc ) {
1103                 CRLog::error("CHM: Cannot parse .hhc");
1104                 return false;
1105             }
1106 
1107     #if DUMP_CHM_DOC==1
1108         LVStreamRef out = LVOpenFileStream(U"/tmp/chm-toc.html", LVOM_WRITE);
1109         if ( !out.isNull() )
1110             doc->saveToStream( out, NULL, true );
1111     #endif
1112 
1113             ldomNode * body = doc->getRootNode(); //doc->createXPointer(cs32("/html[1]/body[1]"));
1114             bool res = false;
1115             if ( body->isElement() ) {
1116                 // body element
1117                 recurseToc( body, 0 );
1118                 // add rest of pages
1119                 for ( int i=0; i<urlList.length(); i++ ) {
1120                     lString32 name = urlList[i];
1121                     if ( name.endsWith(".htm") || name.endsWith(".html") )
1122                         addFile(name);
1123                 }
1124 
1125                 res = _fileList.length()>0;
1126                 while ( _toc && _toc->getParent() )
1127                     _toc = _toc->getParent();
1128                 if ( res && _toc && _toc->getChildCount()>0 ) {
1129                     lString32 name = _toc->getChild(0)->getName();
1130                     CRPropRef m_doc_props = _doc->getProps();
1131                     m_doc_props->setString(DOC_PROP_TITLE, name);
1132                 }
1133             }
1134             delete doc;
1135             return res;
1136         }
1137     }
appendFragments(LVDocViewCallback * progressCallback)1138     int appendFragments( LVDocViewCallback * progressCallback )
1139     {
1140         int appendedFragments = 0;
1141         time_t lastProgressTime = (time_t)time(0);
1142         int lastProgressPercent = -1;
1143         int cnt = _fileList.length();
1144         for ( int i=0; i<cnt; i++ ) {
1145             if ( progressCallback ) {
1146                 int percent = i * 100 / cnt;
1147                 time_t ts = (time_t)time(0);
1148                 if ( ts>lastProgressTime && percent>lastProgressPercent ) {
1149                     progressCallback->OnLoadFileProgress( percent );
1150                     lastProgressTime = ts;
1151                     lastProgressPercent = percent;
1152                 }
1153             }
1154             lString32 fname = _fileList[i];
1155             CRLog::trace("Import file %s", LCSTR(fname));
1156             LVStreamRef stream = _cont->OpenStream(fname.c_str(), LVOM_READ);
1157             if ( stream.isNull() )
1158                 continue;
1159             _appender->setCodeBase(fname);
1160             LVHTMLParser parser(stream, _appender);
1161             parser.SetCharset(_defEncodingName.c_str());
1162             if ( parser.CheckFormat() && parser.Parse() ) {
1163                 // valid
1164                 appendedFragments++;
1165             } else {
1166                 CRLog::error("Document type is not HTML for fragment %s", LCSTR(fname));
1167             }
1168             appendedFragments++;
1169         }
1170         return appendedFragments;
1171     }
1172 };
1173 
ImportCHMDocument(LVStreamRef stream,ldomDocument * doc,LVDocViewCallback * progressCallback,CacheLoadingCallback * formatCallback)1174 bool ImportCHMDocument( LVStreamRef stream, ldomDocument * doc, LVDocViewCallback * progressCallback, CacheLoadingCallback * formatCallback )
1175 {
1176     stream->SetPos(0);
1177     LVContainerRef cont = LVOpenCHMContainer( stream );
1178     if ( cont.isNull() ) {
1179         stream->SetPos(0);
1180         return false;
1181     }
1182     doc->setContainer(cont);
1183 
1184 #if BUILD_LITE!=1
1185     if ( doc->openFromCache(formatCallback) ) {
1186         if ( progressCallback ) {
1187             progressCallback->OnLoadFileEnd( );
1188         }
1189         return true;
1190     }
1191 #endif
1192 
1193     CHMSystem * chm = CHMSystem::open(cont);
1194     if ( !chm )
1195         return false;
1196     lString32 tocFileName = chm->getContentsFileName();
1197     lString32 defEncodingName = chm->getEncodingName();
1198     lString32 mainPageName = chm->getDefaultTopic();
1199     lString32 title = chm->getTitle();
1200     lString32 language = chm->getLanguage();
1201     CRLog::info("CHM: toc=%s, enc=%s, title=%s", LCSTR(tocFileName), LCSTR(defEncodingName), LCSTR(title));
1202     //
1203     lString32Collection urlList;
1204     chm->getUrlList(urlList);
1205     delete chm;
1206 
1207     int fragmentCount = 0;
1208     ldomDocumentWriterFilter writer(doc, false, HTML_AUTOCLOSE_TABLE);
1209     //ldomDocumentWriter writer(doc);
1210     writer.OnStart(NULL);
1211     writer.OnTagOpenNoAttr(U"", U"body");
1212     ldomDocumentFragmentWriter appender(&writer, cs32("body"), cs32("DocFragment"), lString32::empty_str );
1213     CHMTOCReader tocReader(cont, doc, &appender);
1214     if ( !tocReader.init(cont, tocFileName, defEncodingName, urlList, mainPageName) )
1215         return false;
1216 
1217     if ( !title.empty() )
1218         doc->getProps()->setString(DOC_PROP_TITLE, title);
1219     if ( !language.empty() )
1220         doc->getProps()->setString(DOC_PROP_LANGUAGE, language);
1221 
1222     fragmentCount = tocReader.appendFragments( progressCallback );
1223     writer.OnTagClose(U"", U"body");
1224     writer.OnStop();
1225     CRLog::debug("CHM: %d documents merged", fragmentCount);
1226 #if DUMP_CHM_DOC==1
1227     LVStreamRef out = LVOpenFileStream(U"/tmp/chm.html", LVOM_WRITE);
1228     if ( !out.isNull() )
1229         doc->saveToStream( out, NULL, true );
1230 #endif
1231 
1232     return fragmentCount>0;
1233 }
1234 
1235 #endif
1236