1 #include "../include/crsetup.h"
2 #include "../include/lvstream.h"
3 //#define CHM_SUPPORT_ENABLED 1
4 #if CHM_SUPPORT_ENABLED==1
5 #include "../include/chmfmt.h"
6 #include "../include/crlog.h"
7 #include <chm_lib.h>
8
9 #define DUMP_CHM_DOC 0
10
11 struct crChmExternalFileStream : public chmExternalFileStream {
12 /** returns file size, in bytes, if opened successfully */
13 //LONGUINT64 (open)( chmExternalFileStream * instance );
14 /** reads bytes to buffer */
15 //LONGINT64 (read)( chmExternalFileStream * instance, unsigned char * buf, LONGUINT64 pos, LONGINT64 len );
16 /** closes file */
17 //int (close)( chmExternalFileStream * instance );
18 LVStreamRef stream;
cr_opencrChmExternalFileStream19 static LONGUINT64 cr_open( chmExternalFileStream * instance )
20 {
21 return (LONGINT64)((crChmExternalFileStream*)instance)->stream->GetSize();
22 }
23 /** reads bytes to buffer */
cr_readcrChmExternalFileStream24 static LONGINT64 cr_read( chmExternalFileStream * instance, unsigned char * buf, LONGUINT64 pos, LONGINT64 len )
25 {
26 lvsize_t bytesRead = 0;
27 if ( ((crChmExternalFileStream*)instance)->stream->SetPos( (lvpos_t)pos )!= pos )
28 return 0;
29 if ( ((crChmExternalFileStream*)instance)->stream->Read( buf, (lvsize_t)len, &bytesRead ) != LVERR_OK )
30 return false;
31 return bytesRead;
32 }
33 /** closes file */
cr_closecrChmExternalFileStream34 static int cr_close( chmExternalFileStream * instance )
35 {
36 ((crChmExternalFileStream*)instance)->stream.Clear();
37 return 0;
38 }
crChmExternalFileStreamcrChmExternalFileStream39 crChmExternalFileStream( LVStreamRef s )
40 : stream(s)
41 {
42 open = cr_open;
43 read = cr_read;
44 close = cr_close;
45 }
46 };
47
48 class LVCHMStream : public LVNamedStream
49 {
50 protected:
51 chmFile* _file;
52 chmUnitInfo m_ui;
53 lvpos_t m_pos;
54 lvpos_t m_size;
55 public:
LVCHMStream(chmFile * file)56 LVCHMStream( chmFile* file )
57 : _file(file), m_pos(0), m_size(0)
58 {
59 }
open(const char * name)60 bool open( const char * name )
61 {
62 memset(&m_ui, 0, sizeof(m_ui));
63 if ( CHM_RESOLVE_SUCCESS==chm_resolve_object(_file, name, &m_ui ) ) {
64 m_size = (lvpos_t)m_ui.length;
65 return true;
66 }
67 return false;
68 }
69
Seek(lvoffset_t offset,lvseek_origin_t origin,lvpos_t * pNewPos)70 virtual lverror_t Seek( lvoffset_t offset, lvseek_origin_t origin, lvpos_t * pNewPos )
71 {
72 //
73 lvpos_t newpos = m_pos;
74 switch ( origin )
75 {
76 case LVSEEK_SET:
77 newpos = offset;
78 break;
79 case LVSEEK_CUR:
80 newpos += offset;
81 break;
82 case LVSEEK_END:
83 newpos = m_size + offset;
84 break;
85 }
86 if ( newpos>m_size )
87 return LVERR_FAIL;
88 if ( pNewPos!=NULL )
89 *pNewPos = newpos;
90 m_pos = newpos;
91 return LVERR_OK;
92 }
93
94 /// Tell current file position
95 /**
96 \param pNewPos points to place to store file position
97 \return lverror_t status: LVERR_OK if success
98 */
Tell(lvpos_t * pPos)99 virtual lverror_t Tell( lvpos_t * pPos )
100 {
101 *pPos = m_pos;
102 return LVERR_OK;
103 }
104
SetPos(lvpos_t p)105 virtual lvpos_t SetPos(lvpos_t p)
106 {
107 if ( p<=m_size ) {
108 m_pos = p;
109 return m_pos;
110 }
111 return (lvpos_t)(~0);
112 }
113
114 /// Get file position
115 /**
116 \return lvpos_t file position
117 */
GetPos()118 virtual lvpos_t GetPos()
119 {
120 return m_pos;
121 }
122
123 /// Get file size
124 /**
125 \return lvsize_t file size
126 */
GetSize()127 virtual lvsize_t GetSize()
128 {
129 return m_size;
130 }
131
GetSize(lvsize_t * pSize)132 virtual lverror_t GetSize( lvsize_t * pSize )
133 {
134 *pSize = m_size;
135 return LVERR_OK;
136 }
137
Read(void * buf,lvsize_t count,lvsize_t * nBytesRead)138 virtual lverror_t Read( void * buf, lvsize_t count, lvsize_t * nBytesRead )
139 {
140 int cnt = (int)count;
141 if ( m_pos + cnt > m_size )
142 cnt = (int)(m_size - m_pos);
143 if ( cnt <= 0 )
144 return LVERR_FAIL;
145 LONGINT64 gotBytes = chm_retrieve_object(_file, &m_ui, (unsigned char *)buf, m_pos, cnt );
146 m_pos += (lvpos_t)gotBytes;
147 if (nBytesRead)
148 *nBytesRead = (lvsize_t)gotBytes;
149 return LVERR_OK;
150 }
151
152
Write(const void *,lvsize_t,lvsize_t *)153 virtual lverror_t Write( const void * /*buf*/, lvsize_t /*count*/, lvsize_t * /*nBytesWritten*/ )
154 {
155 return LVERR_FAIL;
156 }
157
Eof()158 virtual bool Eof()
159 {
160 return (m_pos >= m_size);
161 }
162
SetSize(lvsize_t size)163 virtual lverror_t SetSize( lvsize_t size )
164 {
165 CR_UNUSED(size);
166 // support only size grow
167 return LVERR_FAIL;
168 }
169
170
171 };
172
173 class LVCHMContainer : public LVNamedContainer
174 {
175 protected:
176 //LVDirectoryContainer * m_parent;
177 crChmExternalFileStream _stream;
178 chmFile* _file;
179 public:
OpenStream(const lChar32 * fname,lvopen_mode_t mode)180 virtual LVStreamRef OpenStream( const lChar32 * fname, lvopen_mode_t mode )
181 {
182 LVStreamRef stream;
183 if ( mode!=LVOM_READ )
184 return stream;
185
186 LVCHMStream * p = new LVCHMStream(_file);
187 lString32 fn(fname);
188 if ( fn[0]!='/' )
189 fn = cs32("/") + fn;
190 if ( !p->open( UnicodeToUtf8(lString32(fn)).c_str() )) {
191 delete p;
192 return stream;
193 }
194 stream = p;
195 stream->SetName( fname );
196 return stream;
197 }
GetParentContainer()198 virtual LVContainer * GetParentContainer()
199 {
200 return NULL;
201 }
GetObjectInfo(int index)202 virtual const LVContainerItemInfo * GetObjectInfo(int index)
203 {
204 if (index>=0 && index<m_list.length())
205 return m_list[index];
206 return NULL;
207 }
GetObjectCount() const208 virtual int GetObjectCount() const
209 {
210 return m_list.length();
211 }
GetSize(lvsize_t * pSize)212 virtual lverror_t GetSize( lvsize_t * pSize )
213 {
214 if (m_fname.empty())
215 return LVERR_FAIL;
216 *pSize = GetObjectCount();
217 return LVERR_OK;
218 }
LVCHMContainer(LVStreamRef s)219 LVCHMContainer(LVStreamRef s) : _stream(s), _file(NULL)
220 {
221 }
~LVCHMContainer()222 virtual ~LVCHMContainer()
223 {
224 SetName(NULL);
225 Clear();
226 if ( _file )
227 chm_close( _file );
228 }
229
addFileItem(const char * filename,LONGUINT64 len)230 void addFileItem( const char * filename, LONGUINT64 len )
231 {
232 LVCommonContainerItemInfo * item = new LVCommonContainerItemInfo();
233 item->SetItemInfo( lString32(filename), (lvsize_t)len, 0, false );
234 //CRLog::trace("CHM file item: %s [%d]", filename, (int)len);
235 Add(item);
236 }
237
CHM_ENUMERATOR_CALLBACK(struct chmFile *,struct chmUnitInfo * ui,void * context)238 static int CHM_ENUMERATOR_CALLBACK (struct chmFile * /*h*/,
239 struct chmUnitInfo *ui,
240 void *context)
241 {
242 LVCHMContainer * c = (LVCHMContainer*)context;
243 if ( (ui->flags & CHM_ENUMERATE_FILES) && (ui->flags & CHM_ENUMERATE_NORMAL) ) {
244 c->addFileItem( ui->path, ui->length );
245 }
246 return CHM_ENUMERATOR_CONTINUE;
247 }
248
open()249 bool open()
250 {
251 _file = chm_open( &_stream );
252 if ( !_file )
253 return false;
254 chm_enumerate( _file,
255 CHM_ENUMERATE_ALL,
256 CHM_ENUMERATOR_CALLBACK,
257 this);
258 return true;
259 }
260 };
261
262 /// opens CHM container
LVOpenCHMContainer(LVStreamRef stream)263 LVContainerRef LVOpenCHMContainer( LVStreamRef stream )
264 {
265 LVCHMContainer * chm = new LVCHMContainer(stream);
266 if ( !chm->open() ) {
267 delete chm;
268 return LVContainerRef();
269 }
270 chm->SetName( stream->GetName() );
271 return LVContainerRef( chm );
272 }
273
DetectCHMFormat(LVStreamRef stream)274 bool DetectCHMFormat( LVStreamRef stream )
275 {
276 stream->SetPos(0);
277 LVContainerRef cont = LVOpenCHMContainer( stream );
278 if ( !cont.isNull() ) {
279 return true;
280 }
281 return false;
282 }
283
284 class CHMBinaryReader {
285 LVStreamRef _stream;
286 public:
CHMBinaryReader(LVStreamRef stream)287 CHMBinaryReader( LVStreamRef stream ) : _stream(stream) {
288 }
setPos(int offset)289 bool setPos( int offset ) {
290 return (int)_stream->SetPos(offset) == offset;
291 }
eof()292 bool eof() {
293 return _stream->Eof();
294 }
295
readInt32(bool & error)296 lUInt32 readInt32( bool & error ) {
297 int b1 = _stream->ReadByte();
298 int b2 = _stream->ReadByte();
299 int b3 = _stream->ReadByte();
300 int b4 = _stream->ReadByte();
301 if ( b1==-1 || b2==-1 || b3==-1 || b4==-1 ) {
302 error = true;
303 return 0;
304 }
305 return (lUInt32)(b1 | (b2<<8) | (b3<<16) | (b4<<24));
306 }
readInt16(bool & error)307 lUInt16 readInt16( bool & error ) {
308 int b1 = _stream->ReadByte();
309 int b2 = _stream->ReadByte();
310 if ( b1==-1 || b2==-1 ) {
311 error = true;
312 return 0;
313 }
314 return (lUInt16)(b1 | (b2<<8));
315 }
readInt8(bool & error)316 lUInt8 readInt8( bool & error ) {
317 int b = _stream->ReadByte();
318 if ( b==-1 ) {
319 error = true;
320 return 0;
321 }
322 return (lUInt8)(b & 0xFF);
323 }
bytesLeft()324 int bytesLeft() {
325 return (int)(_stream->GetSize() - _stream->GetPos());
326 }
327
readBytes(LVArray<lUInt8> & bytes,int offset,int length)328 bool readBytes( LVArray<lUInt8> & bytes, int offset, int length ) {
329 bytes.clear();
330 bytes.reserve(length);
331 if ( offset>=0 )
332 if ((int)_stream->SetPos(offset) != offset)
333 return false;
334 for (int i = 0; i < length; i++) {
335 int b = _stream->ReadByte();
336 if ( b==-1 )
337 return false;
338 bytes[i] = (lUInt8)b;
339 }
340 return true;
341 }
342
343 // offset==-1 to avoid changing position, length==-1 to use 0-terminated
readString(int offset,int length)344 lString8 readString( int offset, int length ) {
345 if ( length==0 )
346 return lString8::empty_str;
347 if ( offset>=0 )
348 if ((int)_stream->SetPos(offset) != offset)
349 return lString8::empty_str;
350 lString8 res;
351 if ( length>0 )
352 res.reserve(length);
353 bool zfound = false;
354 for ( int i=0; i<length || length==-1; i++ ) {
355 int b = _stream->ReadByte();
356 if (zfound || (b==0 && length>=0)) {
357 zfound = true;
358 continue;
359 }
360 if ( b==-1 || b==0 )
361 break;
362 res.append(1, (lUInt8)b);
363 }
364 return res;
365 }
366 // offset==-1 to avoid changing position, length==-1 to use 0-terminated
readStringUtf16(int offset,int length)367 lString32 readStringUtf16( int offset, int length ) {
368 if ( length==0 )
369 return lString32::empty_str;
370 if ( offset>=0 )
371 if ((int)_stream->SetPos(offset) != offset)
372 return lString32::empty_str;
373 lString32 res;
374 if ( length>0 )
375 res.reserve(length);
376 for ( int i=0; i<length || length==-1; i++ ) {
377 int b1 = _stream->ReadByte();
378 if ( b1==-1 || b1==0 )
379 break;
380 int b2 = _stream->ReadByte();
381 if ( b2==-1 || b2==0 )
382 break;
383 res.append(1, (lChar32)(b1 | (b2<<16)));
384 }
385 return res;
386 }
readEncInt()387 lInt64 readEncInt() {
388 lInt64 res = 0;
389 int shift = 0;
390 int b = 0;
391 do {
392 b = _stream->ReadByte();
393 if ( b==-1 )
394 return 0;
395 res |= ( ((lInt64)(b&0x7F)) << shift );
396 shift+=7;
397 } while ( b&0x80 );
398 return res;
399 }
400 };
401
402 class CHMUrlStrEntry {
403 public:
404 lUInt32 offset;
405 lString8 url;
406 };
407
408 const int URLSTR_BLOCK_SIZE = 0x1000;
409 class CHMUrlStr {
410 LVContainerRef _container;
411 CHMBinaryReader _reader;
412 LVPtrVector<CHMUrlStrEntry> _table;
413
CHMUrlStr(LVContainerRef container,LVStreamRef stream)414 CHMUrlStr( LVContainerRef container, LVStreamRef stream ) : _container(container), _reader(stream)
415 {
416
417 }
readInt32(const lUInt8 * & data)418 lUInt32 readInt32( const lUInt8 * & data ) {
419 lUInt32 res = 0;
420 res = *(data++);
421 res = res | (((lUInt32)(*(data++))) << 8);
422 res = res | (((lUInt32)(*(data++))) << 16);
423 res = res | (((lUInt32)(*(data++))) << 24);
424 return res;
425 }
readString(const lUInt8 * & data,int maxlen)426 lString8 readString( const lUInt8 * & data, int maxlen ) {
427 lString8 res;
428 for ( int i=0; i<maxlen; i++ ) {
429 lUInt8 b = *data++;
430 if ( b==0 )
431 break;
432 res.append(1, b);
433 }
434 return res;
435 }
436
437
decodeBlock(const lUInt8 * ptr,lUInt32 blockOffset,int size)438 bool decodeBlock( const lUInt8 * ptr, lUInt32 blockOffset, int size ) {
439 const lUInt8 * data = ptr;
440 const lUInt8 * maxdata = ptr + size;
441 while ( data + 8 < maxdata ) {
442 lUInt32 offset = (lUInt32)(blockOffset + (data - ptr));
443 //lUInt32 urlOffset =
444 readInt32(data);
445 //lUInt32 frameOffset =
446 readInt32(data);
447 if ( data < maxdata ) { //urlOffset > offset ) {
448 CHMUrlStrEntry * item = new CHMUrlStrEntry();
449 item->offset = offset;
450 item->url = readString(data, (int)(maxdata - data));
451 //CRLog::trace("urlstr[offs=%x, url=%s]", item->offset, item->url.c_str());
452 _table.add( item );
453 }
454 }
455 return true;
456 }
457
read()458 bool read() {
459 bool err = false;
460 LVArray<lUInt8> bytes;
461 _reader.readInt8(err);
462 lUInt32 offset = 1;
463 while ( !_reader.eof() && !err ) {
464 int sz = _reader.bytesLeft();
465 if ( sz>URLSTR_BLOCK_SIZE )
466 sz = URLSTR_BLOCK_SIZE;
467 err = !_reader.readBytes(bytes, -1, sz) || err;
468 if ( err )
469 break;
470 err = !decodeBlock( bytes.get(), offset, sz ) || err;
471 offset += sz;
472 }
473 return !err;
474 }
475 public:
open(LVContainerRef container)476 static CHMUrlStr * open( LVContainerRef container ) {
477 LVStreamRef stream = container->OpenStream(U"#URLSTR", LVOM_READ);
478 if ( stream.isNull() )
479 return NULL;
480 CHMUrlStr * res = new CHMUrlStr( container, stream );
481 if ( !res->read() ) {
482 delete res;
483 return NULL;
484 }
485 CRLog::info("CHM URLSTR: %d entries read", res->_table.length());
486 return res;
487 }
findByOffset(lUInt32 offset)488 lString8 findByOffset( lUInt32 offset ) {
489 for ( int i=0; i<_table.length(); i++ ) {
490 if ( _table[i]->offset==offset )
491 return _table[i]->url;
492 }
493 return lString8::empty_str;
494 }
getUrlList(lString32Collection & urlList)495 void getUrlList( lString32Collection & urlList ) {
496 for ( int i=0; i<_table.length(); i++ ) {
497 lString8 s = _table[i]->url;
498 if ( !s.empty() ) {
499 urlList.add(Utf8ToUnicode(s));
500 }
501 }
502 }
503 };
504
505 class CHMUrlTableEntry {
506 public:
507 lUInt32 offset;
508 lUInt32 id;
509 lUInt32 topicsIndex;
510 lUInt32 urlStrOffset;
CHMUrlTableEntry()511 CHMUrlTableEntry()
512 : offset(0)
513 , id(0)
514 , topicsIndex(0)
515 , urlStrOffset(0)
516 {
517
518 }
519 };
520
521 const int URLTBL_BLOCK_SIZE = 0x1000;
522 const int URLTBL_BLOCK_RECORD_COUNT = 341;
523 class CHMUrlTable {
524 LVContainerRef _container;
525 CHMBinaryReader _reader;
526 LVPtrVector<CHMUrlTableEntry> _table;
527 CHMUrlStr * _strings;
528
529
CHMUrlTable(LVContainerRef container,LVStreamRef stream)530 CHMUrlTable( LVContainerRef container, LVStreamRef stream ) : _container(container), _reader(stream), _strings(NULL)
531 {
532
533 }
readInt32(const lUInt8 * & data)534 lUInt32 readInt32( const lUInt8 * & data ) {
535 lUInt32 res = 0;
536 res = *(data++);
537 res = res | (((lUInt32)(*(data++))) << 8);
538 res = res | (((lUInt32)(*(data++))) << 16);
539 res = res | (((lUInt32)(*(data++))) << 24);
540 return res;
541 }
542
decodeBlock(const lUInt8 * data,lUInt32 offset,int size)543 bool decodeBlock( const lUInt8 * data, lUInt32 offset, int size ) {
544 for ( int i=0; i<URLTBL_BLOCK_RECORD_COUNT && size>0; i++ ) {
545 CHMUrlTableEntry * item = new CHMUrlTableEntry();
546 item->offset = offset;
547 item->id = readInt32(data);
548 item->topicsIndex = readInt32(data);
549 item->urlStrOffset = readInt32(data);
550 //CRLog::trace("urltbl[offs=%x, id=%x, ti=%x, urloffs=%x]", item->offset, item->id, item->topicsIndex, item->urlStrOffset);
551 _table.add( item );
552 offset += 4*3;
553 size -= 4*3;
554 }
555 return true;
556 }
557
read()558 bool read() {
559 bool err = false;
560 LVArray<lUInt8> bytes;
561 lUInt32 offset = 0;
562 while ( !_reader.eof() && !err ) {
563 int sz = _reader.bytesLeft();
564 if ( sz>URLTBL_BLOCK_SIZE )
565 sz = URLTBL_BLOCK_SIZE;
566 err = !_reader.readBytes(bytes, -1, sz) || err;
567 if ( err )
568 break;
569 err = !decodeBlock( bytes.get(), offset, sz ) || err;
570 offset += sz;
571 }
572 _strings = CHMUrlStr::open(_container);
573 if ( !_strings ) {
574 CRLog::warn("CHM: cannot read #URLSTR");
575 }
576 return !err;
577 }
578 public:
~CHMUrlTable()579 ~CHMUrlTable() {
580 if ( _strings )
581 delete _strings;
582 }
583
open(LVContainerRef container)584 static CHMUrlTable * open( LVContainerRef container ) {
585 LVStreamRef stream = container->OpenStream(U"#URLTBL", LVOM_READ);
586 if ( stream.isNull() )
587 return NULL;
588 CHMUrlTable * res = new CHMUrlTable( container, stream );
589 if ( !res->read() ) {
590 delete res;
591 return NULL;
592 }
593 CRLog::info("CHM URLTBL: %d entries read", res->_table.length());
594 return res;
595 }
596
urlById(lUInt32 id)597 lString8 urlById( lUInt32 id ) {
598 if ( !_strings )
599 return lString8::empty_str;
600 for ( int i=0; i<_table.length(); i++ ) {
601 if ( _table[i]->id==id )
602 return _strings->findByOffset( _table[i]->urlStrOffset );
603 }
604 return lString8::empty_str;
605 }
606
findById(lUInt32 id)607 CHMUrlTableEntry * findById( lUInt32 id ) {
608 for ( int i=0; i<_table.length(); i++ ) {
609 if ( _table[i]->id==id )
610 return _table[i];
611 }
612 return NULL;
613 }
findByOffset(lUInt32 offset)614 CHMUrlTableEntry * findByOffset( lUInt32 offset ) {
615 for ( int i=0; i<_table.length(); i++ ) {
616 if ( _table[i]->offset==offset )
617 return _table[i];
618 }
619 return NULL;
620 }
621
getUrlList(lString32Collection & urlList)622 void getUrlList( lString32Collection & urlList ) {
623 if ( !_strings )
624 return;
625 _strings->getUrlList( urlList );
626 // for ( int i=0; i<_table.length(); i++ ) {
627 // lString8 s = _strings->findByOffset( _table[i]->urlStrOffset );
628 // if ( !s.empty() ) {
629 // urlList.add(Utf8ToUnicode(s));
630 // }
631 // }
632 }
633 };
634
635 class CHMSystem {
636
637 LVContainerRef _container;
638 CHMBinaryReader _reader;
639 lUInt32 _fileVersion;
640 lString8 _contentsFile;
641 lString8 _indexFile;
642 lString8 _defaultTopic;
643 lString8 _title;
644 lString8 _language;
645 lString8 _defaultFont;
646 lUInt32 _lcid;
647 bool _dbcs;
648 bool _fullTextSearch;
649 bool _hasKLinks;
650 bool _hasALinks;
651 lUInt32 _binaryIndexURLTableId;
652 lUInt32 _binaryTOCURLTableId;
653 const lChar32 * _enc_table;
654 lString32 _enc_name;
655 CHMUrlTable * _urlTable;
656
CHMSystem(LVContainerRef container,LVStreamRef stream)657 CHMSystem( LVContainerRef container, LVStreamRef stream ) : _container(container), _reader(stream)
658 , _fileVersion(0)
659 , _lcid(0)
660 , _dbcs(false)
661 , _fullTextSearch(false)
662 , _hasKLinks(false)
663 , _hasALinks(false)
664 , _binaryIndexURLTableId(0)
665 , _binaryTOCURLTableId(0)
666 , _enc_table(NULL)
667 , _urlTable(NULL)
668 {
669 }
670
decodeEntry()671 bool decodeEntry() {
672 bool err = false;
673 int code = _reader.readInt16(err);
674 int length = _reader.readInt16(err);
675 //CRLog::trace("CHM binary item code=%d, length=%d, bytesLeft=%d", code, length, _reader.bytesLeft());
676 if ( err )
677 return false;
678 LVArray<lUInt8> bytes;
679 switch( code ) {
680 case 0:
681 _contentsFile = _reader.readString(-1, length);
682 break;
683 case 1:
684 _indexFile = _reader.readString(-1, length);
685 break;
686 case 2:
687 _defaultTopic = _reader.readString(-1, length);
688 break;
689 case 3:
690 _title = _reader.readString(-1, length);
691 break;
692 case 4:
693 {
694 _lcid = _reader.readInt32(err);
695 int codepage = langToCodepage( _lcid );
696 const lChar32 * enc_name = GetCharsetName( codepage );
697 const lChar32 * table = GetCharsetByte2UnicodeTable( codepage );
698 _language = langToLanguage( _lcid );
699 if ( enc_name!=NULL ) {
700 _enc_table = table;
701 _enc_name = lString32(enc_name);
702 CRLog::info("CHM LCID: %08x, charset=%s", _lcid, LCSTR(_enc_name));
703 } else {
704 CRLog::info("CHM LCID: %08x -- cannot find charset encoding table", _lcid);
705 }
706 _dbcs = _reader.readInt32(err)==1;
707 _fullTextSearch = _reader.readInt32(err)==1;
708 _hasKLinks = _reader.readInt32(err)==1;
709 _hasALinks = _reader.readInt32(err)==1;
710 err = !_reader.readBytes(bytes, -1, length - (5*4)) || err;
711 }
712 break;
713 case 7:
714 if ( _fileVersion>2 )
715 _binaryIndexURLTableId = _reader.readInt32(err);
716 else
717 err = !_reader.readBytes(bytes, -1, length) || err;
718 break;
719 case 11:
720 if ( _fileVersion>2 )
721 _binaryTOCURLTableId = _reader.readInt32(err);
722 else
723 err = !_reader.readBytes(bytes, -1, length) || err;
724 break;
725 case 16:
726 _defaultFont = _reader.readString(-1, length);
727 CRLog::info("CHM default font: %s", _defaultFont.c_str());
728 if ( _enc_table==NULL ) {
729 for ( int i=_defaultFont.length()-1; i>0; i-- ) {
730 if ( _defaultFont[i]==',' ) {
731 int cs = _defaultFont.substr(i+1, _defaultFont.length()-i-1).atoi();
732 const lChar32 * cpname = NULL;
733 switch (cs) {
734 case 0x00: cpname = U"windows-1252"; break;
735 case 0xCC: cpname = U"windows-1251"; break;
736 case 0xEE: cpname = U"windows-1250"; break;
737 case 0xA1: cpname = U"windows-1253"; break;
738 case 0xA2: cpname = U"windows-1254"; break;
739 case 0xBA: cpname = U"windows-1257"; break;
740 case 0xB1: cpname = U"windows-1255"; break;
741 case 0xB2: cpname = U"windows-1256"; break;
742 default: break;
743 }
744 const lChar32 * table = GetCharsetByte2UnicodeTable( cpname );
745 if ( cpname!=NULL && table!=NULL ) {
746 CRLog::info("CHM charset detected from default font: %s", LCSTR(lString32(cpname)));
747 _enc_table = table;
748 _enc_name = lString32(cpname);
749 }
750 break;
751 }
752 }
753 }
754 break;
755 default:
756 err = !_reader.readBytes(bytes, -1, length) || err;
757 break;
758 }
759 return !err;
760 }
761
read()762 bool read() {
763 bool err = false;
764 _fileVersion = _reader.readInt32(err);
765 int count = 0;
766 while ( !_reader.eof() && !err ) {
767 err = !decodeEntry() || err;
768 if ( !err )
769 count++;
770 }
771
772 if ( err ) {
773 CRLog::error("CHM decoding error: %d blocks decoded, stream bytes left=%d", count, _reader.bytesLeft() );
774 return false;
775 }
776 if ( _enc_table==NULL ) {
777 _enc_table = GetCharsetByte2UnicodeTable( 1252 );
778 _enc_name = cs32("windows-1252");
779 }
780 _urlTable = CHMUrlTable::open(_container);
781 return !err;
782 }
783
784 public:
~CHMSystem()785 ~CHMSystem() {
786 if ( _urlTable!=NULL )
787 delete _urlTable;
788 }
789
open(LVContainerRef container)790 static CHMSystem * open( LVContainerRef container ) {
791 LVStreamRef stream = container->OpenStream(U"#SYSTEM", LVOM_READ);
792 if ( stream.isNull() )
793 return NULL;
794 CHMSystem * res = new CHMSystem( container, stream );
795 if ( !res->read() ) {
796 delete res;
797 return NULL;
798 }
799 return res;
800 }
801
decodeString(const lString8 & str)802 lString32 decodeString( const lString8 & str ) {
803 return ByteToUnicode( str, _enc_table );
804 }
805
getTitle()806 lString32 getTitle() {
807 return decodeString(_title);
808 }
809
getLanguage()810 lString32 getLanguage() {
811 return decodeString(_language);
812 }
813
getDefaultTopic()814 lString32 getDefaultTopic() {
815 return decodeString(_defaultTopic);
816 }
817
getEncodingName()818 lString32 getEncodingName() {
819 return _enc_name;
820 }
821
getContentsFileName()822 lString32 getContentsFileName() {
823 if ( _binaryTOCURLTableId!=0 ) {
824 lString8 url = _urlTable->urlById(_binaryTOCURLTableId);
825 if ( !url.empty() )
826 return decodeString(url);
827 }
828 if ( _contentsFile.empty() ) {
829 lString32 hhcName;
830 int bestSize = 0;
831 for ( int i=0; i<_container->GetObjectCount(); i++ ) {
832 const LVContainerItemInfo * item = _container->GetObjectInfo(i);
833 if ( !item->IsContainer() ) {
834 lString32 name = item->GetName();
835 int sz = item->GetSize();
836 //CRLog::trace("CHM item: %s", LCSTR(name));
837 lString32 lname = name;
838 lname.lowercase();
839 if ( lname.endsWith(".hhc") ) {
840 if ( sz > bestSize ) {
841 hhcName = name;
842 bestSize = sz;
843 }
844 }
845 }
846 }
847 if ( !hhcName.empty() )
848 return hhcName;
849 }
850 return decodeString(_contentsFile);
851 }
getUrlList(lString32Collection & urlList)852 void getUrlList( lString32Collection & urlList ) {
853 if ( !_urlTable )
854 return;
855 _urlTable->getUrlList(urlList);
856 }
857 };
858
LVParseCHMHTMLStream(LVStreamRef stream,lString32 defEncodingName)859 ldomDocument * LVParseCHMHTMLStream( LVStreamRef stream, lString32 defEncodingName )
860 {
861 if ( stream.isNull() )
862 return NULL;
863
864 // detect encondig
865 stream->SetPos(0);
866
867 #if 0
868 ldomDocument * encDetectionDoc = LVParseHTMLStream( stream );
869 int encoding = 0;
870 if ( encDetectionDoc!=NULL ) {
871 ldomNode * node = encDetectionDoc->nodeFromXPath(U"/html/body/object[1]");
872 if ( node!=NULL ) {
873 for ( int i=0; i<node->getChildCount(); i++ ) {
874 ldomNode * child = node->getChildNode(i);
875 if (child && child->isElement() && child->getNodeName() == "param" && child->getAttributeValue(U"name") == "Font") {
876 lString32 s = child->getAttributeValue(U"value");
877 lString32 lastDigits;
878 for ( int i=s.length()-1; i>=0; i-- ) {
879 lChar32 ch = s[i];
880 if ( ch>='0' && ch<='9' )
881 lastDigits.insert(0, 1, ch);
882 else
883 break;
884 }
885 encoding = lastDigits.atoi();
886 CRLog::debug("LVParseCHMHTMLStream: encoding detected: %d", encoding);
887 }
888 }
889 }
890 delete encDetectionDoc;
891 }
892 const lChar32 * enc = U"cp1252";
893 if ( encoding==1 ) {
894 enc = U"cp1251";
895 }
896 #endif
897
898 stream->SetPos(0);
899 bool error = true;
900 ldomDocument * doc;
901 doc = new ldomDocument();
902 doc->setDocFlags( 0 );
903
904 ldomDocumentWriterFilter writerFilter(doc, false, HTML_AUTOCLOSE_TABLE);
905 writerFilter.setFlags(writerFilter.getFlags() | TXTFLG_CONVERT_8BIT_ENTITY_ENCODING);
906
907 /// FB2 format
908 LVFileFormatParser * parser = new LVHTMLParser(stream, &writerFilter);
909 if ( !defEncodingName.empty() )
910 parser->SetCharset(defEncodingName.c_str());
911 if ( parser->CheckFormat() ) {
912 if ( parser->Parse() ) {
913 error = false;
914 }
915 }
916 delete parser;
917 if ( error ) {
918 delete doc;
919 doc = NULL;
920 }
921 return doc;
922 }
923
filename_comparator(lString32 & _s1,lString32 & _s2)924 static int filename_comparator(lString32 & _s1, lString32 & _s2) {
925 lString32 s1 = _s1.substr(1);
926 lString32 s2 = _s2.substr(1);
927 if (s1.endsWith(".htm"))
928 s1.erase(s1.length()-4, 4);
929 else if (s1.endsWith(".html"))
930 s1.erase(s1.length()-5, 5);
931 if (s2.endsWith(".htm"))
932 s2.erase(s2.length()-4, 4);
933 else if (s2.endsWith(".html"))
934 s2.erase(s2.length()-5, 5);
935 if (s1 == "index")
936 return -1;
937 else if (s2 == "index")
938 return 1;
939 if (s1 == "header")
940 return -1;
941 else if (s2 == "header")
942 return 1;
943 int d1 = 0;
944 int d2 = 0;
945 s1.atoi(d1);
946 s2.atoi(d2);
947 if (d1 || d2) {
948 if (d1 && d2) {
949 if (d1 < d2)
950 return -1;
951 else if (d1 > d2)
952 return 1;
953 return 0;
954 } else if (d1) {
955 return -1;
956 } else {
957 return 1;
958 }
959 }
960 return s1.compare(s2);
961 }
962
963 class CHMTOCReader {
964 LVContainerRef _cont;
965 ldomDocumentFragmentWriter * _appender;
966 ldomDocument * _doc;
967 LVTocItem * _toc;
968 lString32HashedCollection _fileList;
969 lString32 lastFile;
970 lString32 _defEncodingName;
971 bool _fakeToc;
972 public:
CHMTOCReader(LVContainerRef cont,ldomDocument * doc,ldomDocumentFragmentWriter * appender)973 CHMTOCReader( LVContainerRef cont, ldomDocument * doc, ldomDocumentFragmentWriter * appender )
974 : _cont(cont), _appender(appender), _doc(doc), _fileList(1024)
975 {
976 _toc = _doc->getToc();
977 }
addFile(const lString32 & v1)978 void addFile( const lString32 & v1 ) {
979 int index = _fileList.find(v1.c_str());
980 if ( index>=0 )
981 return; // already added
982 _fileList.add(v1.c_str());
983 CRLog::trace("New source file: %s", LCSTR(v1) );
984 _appender->addPathSubstitution( v1, cs32("_doc_fragment_") + fmt::decimal(_fileList.length()) );
985 _appender->setCodeBase( v1 );
986 }
987
addTocItem(lString32 name,lString32 url,int level)988 void addTocItem( lString32 name, lString32 url, int level )
989 {
990 //CRLog::trace("CHM toc level %d: '%s' : %s", level, LCSTR(name), LCSTR(url) );
991 if (url.startsWith(".."))
992 url = LVExtractFilename( url );
993 lString32 v1, v2;
994 if ( !url.split2(cs32("#"), v1, v2) )
995 v1 = url;
996 PreProcessXmlString( name, 0 );
997 addFile(v1);
998 lString32 url2 = _appender->convertHref(url);
999 //CRLog::trace("new url: %s", LCSTR(url2) );
1000 while ( _toc->getLevel()>level && _toc->getParent() )
1001 _toc = _toc->getParent();
1002 _toc = _toc->addChild(name, ldomXPointer(), url2);
1003 }
1004
recurseToc(ldomNode * node,int level)1005 void recurseToc( ldomNode * node, int level )
1006 {
1007 lString32 nodeName = node->getNodeName();
1008 lUInt16 paramElemId = node->getDocument()->getElementNameIndex(U"param");
1009 if (nodeName == "object") {
1010 if ( level>0 ) {
1011 // process object
1012 if (node->getAttributeValue("type") == "text/sitemap") {
1013 lString32 name, local;
1014 int cnt = node->getChildCount();
1015 for ( int i=0; i<cnt; i++ ) {
1016 ldomNode * child = node->getChildElementNode(i, paramElemId);
1017 if ( child ) {
1018 lString32 paramName = child->getAttributeValue("name");
1019 lString32 paramValue = child->getAttributeValue("value");
1020 if (paramName == "Name")
1021 name = paramValue;
1022 else if (paramName == "Local")
1023 local = paramValue;
1024 }
1025 }
1026 if ( !local.empty() && !name.empty() ) {
1027 // found!
1028 addTocItem( name, local, level );
1029 }
1030 }
1031 }
1032 return;
1033 }
1034 if (nodeName == "ul")
1035 level++;
1036 int cnt = node->getChildCount();
1037 for ( int i=0; i<cnt; i++ ) {
1038 ldomNode * child = node->getChildElementNode(i);
1039 if ( child ) {
1040 recurseToc( child, level );
1041 }
1042 }
1043 }
1044
init(LVContainerRef cont,lString32 hhcName,lString32 defEncodingName,lString32Collection & urlList,lString32 mainPageName)1045 bool init( LVContainerRef cont, lString32 hhcName, lString32 defEncodingName, lString32Collection & urlList, lString32 mainPageName )
1046 {
1047 if ( hhcName.empty() && urlList.length()==0 ) {
1048 lString32Collection htms;
1049 for (int i=0; i<cont->GetObjectCount(); i++) {
1050 const LVContainerItemInfo * item = cont->GetObjectInfo(i);
1051 if (item->IsContainer())
1052 continue;
1053 lString32 name = item->GetName();
1054 if (name == "/bookindex.htm" || name == "/headerindex.htm")
1055 continue;
1056 //CRLog::trace("item %d : %s", i, LCSTR(name));
1057 if (name.endsWith(".htm") || name.endsWith(".html"))
1058 htms.add(name);
1059 }
1060 if (!htms.length())
1061 return false;
1062 // {
1063 // for (int j=0; j<htms.length(); j++) {
1064 // CRLog::trace("unsorted %d : %s", j, LCSTR(htms[j]));
1065 // }
1066 // }
1067 htms.sort(filename_comparator);
1068 // {
1069 // for (int j=0; j<htms.length(); j++) {
1070 // CRLog::trace("sorted %d : %s", j, LCSTR(htms[j]));
1071 // }
1072 // }
1073 urlList.addAll(htms);
1074 }
1075 _defEncodingName = defEncodingName;
1076
1077 if ( !mainPageName.empty() )
1078 addFile(mainPageName);
1079
1080 if ( hhcName.empty() ) {
1081 _fakeToc = true;
1082 for ( int i=0; i<urlList.length(); i++ ) {
1083 //lString32 name = lString32::itoa(i+1);
1084 lString32 name = urlList[i];
1085 if ( name.endsWith(".htm") )
1086 name = name.substr(0, name.length()-4);
1087 else if ( name.endsWith(".html") )
1088 name = name.substr(0, name.length()-5);
1089 if (name.startsWith("/"))
1090 name = name.substr(1);
1091 addTocItem( name, urlList[i], 0 );
1092 }
1093 return true;
1094 } else {
1095 _fakeToc = false;
1096 LVStreamRef tocStream = cont->OpenStream(hhcName.c_str(), LVOM_READ);
1097 if ( tocStream.isNull() ) {
1098 CRLog::error("CHM: Cannot open .hhc");
1099 return false;
1100 }
1101 ldomDocument * doc = LVParseCHMHTMLStream( tocStream, defEncodingName );
1102 if ( !doc ) {
1103 CRLog::error("CHM: Cannot parse .hhc");
1104 return false;
1105 }
1106
1107 #if DUMP_CHM_DOC==1
1108 LVStreamRef out = LVOpenFileStream(U"/tmp/chm-toc.html", LVOM_WRITE);
1109 if ( !out.isNull() )
1110 doc->saveToStream( out, NULL, true );
1111 #endif
1112
1113 ldomNode * body = doc->getRootNode(); //doc->createXPointer(cs32("/html[1]/body[1]"));
1114 bool res = false;
1115 if ( body->isElement() ) {
1116 // body element
1117 recurseToc( body, 0 );
1118 // add rest of pages
1119 for ( int i=0; i<urlList.length(); i++ ) {
1120 lString32 name = urlList[i];
1121 if ( name.endsWith(".htm") || name.endsWith(".html") )
1122 addFile(name);
1123 }
1124
1125 res = _fileList.length()>0;
1126 while ( _toc && _toc->getParent() )
1127 _toc = _toc->getParent();
1128 if ( res && _toc && _toc->getChildCount()>0 ) {
1129 lString32 name = _toc->getChild(0)->getName();
1130 CRPropRef m_doc_props = _doc->getProps();
1131 m_doc_props->setString(DOC_PROP_TITLE, name);
1132 }
1133 }
1134 delete doc;
1135 return res;
1136 }
1137 }
appendFragments(LVDocViewCallback * progressCallback)1138 int appendFragments( LVDocViewCallback * progressCallback )
1139 {
1140 int appendedFragments = 0;
1141 time_t lastProgressTime = (time_t)time(0);
1142 int lastProgressPercent = -1;
1143 int cnt = _fileList.length();
1144 for ( int i=0; i<cnt; i++ ) {
1145 if ( progressCallback ) {
1146 int percent = i * 100 / cnt;
1147 time_t ts = (time_t)time(0);
1148 if ( ts>lastProgressTime && percent>lastProgressPercent ) {
1149 progressCallback->OnLoadFileProgress( percent );
1150 lastProgressTime = ts;
1151 lastProgressPercent = percent;
1152 }
1153 }
1154 lString32 fname = _fileList[i];
1155 CRLog::trace("Import file %s", LCSTR(fname));
1156 LVStreamRef stream = _cont->OpenStream(fname.c_str(), LVOM_READ);
1157 if ( stream.isNull() )
1158 continue;
1159 _appender->setCodeBase(fname);
1160 LVHTMLParser parser(stream, _appender);
1161 parser.SetCharset(_defEncodingName.c_str());
1162 if ( parser.CheckFormat() && parser.Parse() ) {
1163 // valid
1164 appendedFragments++;
1165 } else {
1166 CRLog::error("Document type is not HTML for fragment %s", LCSTR(fname));
1167 }
1168 appendedFragments++;
1169 }
1170 return appendedFragments;
1171 }
1172 };
1173
ImportCHMDocument(LVStreamRef stream,ldomDocument * doc,LVDocViewCallback * progressCallback,CacheLoadingCallback * formatCallback)1174 bool ImportCHMDocument( LVStreamRef stream, ldomDocument * doc, LVDocViewCallback * progressCallback, CacheLoadingCallback * formatCallback )
1175 {
1176 stream->SetPos(0);
1177 LVContainerRef cont = LVOpenCHMContainer( stream );
1178 if ( cont.isNull() ) {
1179 stream->SetPos(0);
1180 return false;
1181 }
1182 doc->setContainer(cont);
1183
1184 #if BUILD_LITE!=1
1185 if ( doc->openFromCache(formatCallback) ) {
1186 if ( progressCallback ) {
1187 progressCallback->OnLoadFileEnd( );
1188 }
1189 return true;
1190 }
1191 #endif
1192
1193 CHMSystem * chm = CHMSystem::open(cont);
1194 if ( !chm )
1195 return false;
1196 lString32 tocFileName = chm->getContentsFileName();
1197 lString32 defEncodingName = chm->getEncodingName();
1198 lString32 mainPageName = chm->getDefaultTopic();
1199 lString32 title = chm->getTitle();
1200 lString32 language = chm->getLanguage();
1201 CRLog::info("CHM: toc=%s, enc=%s, title=%s", LCSTR(tocFileName), LCSTR(defEncodingName), LCSTR(title));
1202 //
1203 lString32Collection urlList;
1204 chm->getUrlList(urlList);
1205 delete chm;
1206
1207 int fragmentCount = 0;
1208 ldomDocumentWriterFilter writer(doc, false, HTML_AUTOCLOSE_TABLE);
1209 //ldomDocumentWriter writer(doc);
1210 writer.OnStart(NULL);
1211 writer.OnTagOpenNoAttr(U"", U"body");
1212 ldomDocumentFragmentWriter appender(&writer, cs32("body"), cs32("DocFragment"), lString32::empty_str );
1213 CHMTOCReader tocReader(cont, doc, &appender);
1214 if ( !tocReader.init(cont, tocFileName, defEncodingName, urlList, mainPageName) )
1215 return false;
1216
1217 if ( !title.empty() )
1218 doc->getProps()->setString(DOC_PROP_TITLE, title);
1219 if ( !language.empty() )
1220 doc->getProps()->setString(DOC_PROP_LANGUAGE, language);
1221
1222 fragmentCount = tocReader.appendFragments( progressCallback );
1223 writer.OnTagClose(U"", U"body");
1224 writer.OnStop();
1225 CRLog::debug("CHM: %d documents merged", fragmentCount);
1226 #if DUMP_CHM_DOC==1
1227 LVStreamRef out = LVOpenFileStream(U"/tmp/chm.html", LVOM_WRITE);
1228 if ( !out.isNull() )
1229 doc->saveToStream( out, NULL, true );
1230 #endif
1231
1232 return fragmentCount>0;
1233 }
1234
1235 #endif
1236