1 /* This file is (c) 2013 Timon Wong <timon86.wang AT gmail DOT com>
2 * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
3
4 #include "mdx.hh"
5 #include "btreeidx.hh"
6 #include "folding.hh"
7 #include "utf8.hh"
8 #include "file.hh"
9 #include "wstring.hh"
10 #include "wstring_qt.hh"
11 #include "chunkedstorage.hh"
12 #include "gddebug.hh"
13 #include "langcoder.hh"
14 #include "fsencoding.hh"
15 #include "audiolink.hh"
16 #include "ex.hh"
17 #include "mdictparser.hh"
18 #include "filetype.hh"
19 #include "ftshelpers.hh"
20 #include "htmlescape.hh"
21
22 #include <algorithm>
23 #include <map>
24 #include <set>
25 #include <list>
26 #include <ctype.h>
27 #include <stdlib.h>
28
29 #ifdef _MSC_VER
30 #include <stub_msvc.h>
31 #endif
32
33 #include <QDir>
34 #include <QString>
35 #include <QSemaphore>
36 #include <QThreadPool>
37 #include <QAtomicInt>
38 #include <QTextDocument>
39 #include <QCryptographicHash>
40
41 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
42 #include <QRegularExpression>
43 #endif
44
45 #include "qt4x5.hh"
46
47 namespace Mdx
48 {
49
50 using std::map;
51 using std::multimap;
52 using std::set;
53 using gd::wstring;
54 using gd::wchar;
55 using std::list;
56 using std::pair;
57 using std::string;
58
59 using BtreeIndexing::WordArticleLink;
60 using BtreeIndexing::IndexedWords;
61 using BtreeIndexing::IndexInfo;
62
63 using namespace Mdict;
64
/// Constants identifying the index file format.
enum
{
  /// Signature value for IdxHeader::signature; the four bytes read "MDIC"
  /// when the value is stored little-endian.
  kSignature = 0x4349444d, // MDIC
  /// Current index format version. The btree and folding versions are added
  /// in, so a change to either of them changes this value as well.
  kCurrentFormatVersion = 11 + BtreeIndexing::FormatVersion + Folding::Version
};
70
71 DEF_EX( exCorruptDictionary, "dictionary file was tampered or corrupted", std::exception )
72
/// Fixed-size header read from the very start of the index file (the
/// constructor of MdxDictionary reads it with idx.read< IdxHeader >()).
/// The struct is packed on non-MSVC compilers, and the field order and sizes
/// define the on-disk layout, so neither may be changed casually.
struct IdxHeader
{
  uint32_t signature; // First comes the signature, MDIC
  uint32_t formatVersion; // File format version (see kCurrentFormatVersion;
                          // the comparison itself is not in this part of the file)
  uint32_t parserVersion; // Version of the parser used to parse the MDIC file.
  // Version of the folding algorithm used when building
  // index. If it's different from the current one,
  // the file is to be rebuilt.
  uint32_t foldingVersion;

  uint32_t articleCount; // Total number of articles, for informative purposes only
  uint32_t wordCount; // Total number of words, for informative purposes only

  uint32_t isRightToLeft; // Right to left
  uint32_t chunksOffset; // The offset to chunks' storage

  uint32_t descriptionAddress; // Address of the dictionary description in the chunks' storage
  uint32_t descriptionSize; // Size of the description in the chunks' storage, 0 = no description

  uint32_t styleSheetAddress; // Index-file offset of the stylesheet records
  uint32_t styleSheetCount; // Number of stylesheet records stored at styleSheetAddress

  uint32_t indexBtreeMaxElements; // Two fields from IndexInfo
  uint32_t indexRootOffset;

  uint32_t langFrom; // Source language
  uint32_t langTo; // Target language

  uint32_t mddIndexInfosOffset; // address of IndexInfos for resource files (.mdd)
  uint32_t mddIndexInfosCount; // count of IndexInfos for resource files
}
#ifndef _MSC_VER
__attribute__( ( packed ) )
#endif
;
108
109 // A helper method to read resources from .mdd file
110 class IndexedMdd: public BtreeIndexing::BtreeIndex
111 {
112 Mutex & idxMutex;
113 Mutex fileMutex;
114 ChunkedStorage::Reader & chunks;
115 QFile mddFile;
116 bool isFileOpen;
117
118 public:
119
IndexedMdd(Mutex & idxMutex,ChunkedStorage::Reader & chunks)120 IndexedMdd( Mutex & idxMutex, ChunkedStorage::Reader & chunks ):
121 idxMutex( idxMutex ),
122 chunks( chunks ),
123 isFileOpen( false )
124 {}
125
126 /// Opens the index. The values are those previously returned by buildIndex().
127 using BtreeIndexing::BtreeIndex::openIndex;
128
129 /// Opens the mdd file itself. Returns true if succeeded, false otherwise.
open(const char * fileName)130 bool open( const char * fileName )
131 {
132 mddFile.setFileName( QString::fromUtf8( fileName ) );
133 isFileOpen = mddFile.open( QFile::ReadOnly );
134 return isFileOpen;
135 }
136
137 /// Returns true if the mdd is open, false otherwise.
isOpen() const138 inline bool isOpen() const
139 {
140 return isFileOpen;
141 }
142
143 /// Checks whether the given file exists in the mdd file or not.
144 /// Note that this function is thread-safe, since it does not access mdd file.
hasFile(gd::wstring const & name)145 bool hasFile( gd::wstring const & name )
146 {
147 if ( !isFileOpen )
148 return false;
149 vector< WordArticleLink > links = findArticles( name );
150 return !links.empty();
151 }
152
153 /// Attempts loading the given file into the given vector. Returns true on
154 /// success, false otherwise.
loadFile(gd::wstring const & name,std::vector<char> & result)155 bool loadFile( gd::wstring const & name, std::vector< char > & result )
156 {
157 if ( !isFileOpen )
158 return false;
159
160 vector< WordArticleLink > links = findArticles( name );
161 if ( links.empty() )
162 return false;
163
164 MdictParser::RecordInfo indexEntry;
165 vector< char > chunk;
166 Mutex::Lock _( idxMutex );
167 const char * indexEntryPtr = chunks.getBlock( links[ 0 ].articleOffset, chunk );
168 memcpy( &indexEntry, indexEntryPtr, sizeof( indexEntry ) );
169
170 ScopedMemMap compressed( mddFile, indexEntry.compressedBlockPos, indexEntry.compressedBlockSize );
171 if ( !compressed.startAddress() )
172 {
173 return false;
174 }
175
176 QByteArray decompressed;
177 if ( !MdictParser::parseCompressedBlock( indexEntry.compressedBlockSize, ( char * )compressed.startAddress(),
178 indexEntry.decompressedBlockSize, decompressed ) )
179 {
180 return false;
181 }
182
183 result.resize( indexEntry.recordSize );
184 memcpy( &result.front(), decompressed.constData() + indexEntry.recordOffset, indexEntry.recordSize );
185 return true;
186 }
187
188 };
189
/// Dictionary backed by an .mdx file, optional .mdd resource files and a
/// prebuilt index file.
///
/// The constructor reads only the header, the name and the encoding from the
/// index. Stylesheets, the btree index and the per-.mdd indexes are loaded
/// later by doDeferredInit(), either from a thread-pool runnable started by
/// deferredInit() or synchronously through ensureInitDone().
class MdxDictionary: public BtreeIndexing::BtreeDictionary
{
  Mutex idxMutex;
  // NOTE: idx, idxHeader and chunks are initialized in this order by the
  // constructor's initializer list and each depends on the previous one;
  // keep their relative declaration order.
  File::Class idx;
  IdxHeader idxHeader;
  string dictionaryName;
  string encoding;
  ChunkedStorage::Reader chunks;
  QFile dictFile;
  vector< sptr< IndexedMdd > > mddResources;
  MdictParser::StyleSheets styleSheets;

  // Deferred-initialization state: the flag is set once doDeferredInit() has
  // run (successfully or not); the semaphore is released by the runnable's
  // destructor.
  QAtomicInt deferredInitDone;
  Mutex deferredInitMutex;
  bool deferredInitRunnableStarted;
  QSemaphore deferredInitRunnableExited;

  // Empty on success; otherwise the message of the exception caught in
  // doDeferredInit().
  string initError;

public:

  MdxDictionary( string const & id, string const & indexFile, vector<string> const & dictionaryFiles );

  ~MdxDictionary();

  /// Schedules doDeferredInit() on the global thread pool (at most once).
  virtual void deferredInit();

  /// Returns the dictionary name read from the index file.
  virtual string getName() throw()
  {
    return dictionaryName;
  }

  /// Always returns an empty property map.
  virtual map< Dictionary::Property, string > getProperties() throw()
  {
    return map< Dictionary::Property, string >();
  }

  /// Returns the article count recorded in the index header.
  virtual unsigned long getArticleCount() throw()
  {
    return idxHeader.articleCount;
  }

  /// Returns the word count recorded in the index header.
  virtual unsigned long getWordCount() throw()
  {
    return idxHeader.wordCount;
  }

  /// Returns the source language id recorded in the index header.
  inline virtual quint32 getLangFrom() const
  {
    return idxHeader.langFrom;
  }

  /// Returns the target language id recorded in the index header.
  inline virtual quint32 getLangTo() const
  {
    return idxHeader.langTo;
  }

  virtual sptr< Dictionary::DataRequest > getArticle( wstring const & word,
                                                      vector< wstring > const & alts,
                                                      wstring const &,
                                                      bool ignoreDiacritics ) THROW_SPEC( std::exception );
  virtual sptr< Dictionary::DataRequest > getResource( string const & name ) THROW_SPEC( std::exception );
  virtual QString const & getDescription();

  virtual sptr< Dictionary::DataRequest > getSearchResults( QString const & searchString,
                                                            int searchMode, bool matchCase,
                                                            int distanceBetweenWords,
                                                            int maxResults,
                                                            bool ignoreWordsOrder,
                                                            bool ignoreDiacritics );
  virtual void getArticleText( uint32_t articleAddress, QString & headword, QString & text );

  virtual void makeFTSIndex(QAtomicInt & isCancelled, bool firstIteration );

  /// Decides whether full-text search may be used for this dictionary.
  /// Does nothing if initialization failed. Otherwise FTS is allowed when it
  /// is enabled, "MDICT" is not among the disabled types, and the article
  /// count does not exceed the configured maximum (0 means no limit).
  virtual void setFTSParameters( Config::FullTextSearch const & fts )
  {
    if( ensureInitDone().size() )
      return;

    can_FTS = fts.enabled
              && !fts.disabledTypes.contains( "MDICT", Qt::CaseInsensitive )
              && ( fts.maxDictionarySize == 0 || getArticleCount() <= fts.maxDictionarySize );
  }
protected:

  virtual void loadIcon() throw();

private:

  /// Runs the deferred initialization if needed and returns the resulting
  /// error string (empty on success).
  virtual string const & ensureInitDone();
  void doDeferredInit();

  /// Loads an article with the given offset, filling the given strings.
  void loadArticle( uint32_t offset, string & articleText, bool noFilter = false );

  /// Process resource links (images, audios, etc)
  QString & filterResource( QString const & articleId, QString & article );

  friend class MdxHeadwordsRequest;
  friend class MdxArticleRequest;
  friend class MddResourceRequest;
  friend class MdxDeferredInitRunnable;
};
293
MdxDictionary(string const & id,string const & indexFile,vector<string> const & dictionaryFiles)294 MdxDictionary::MdxDictionary( string const & id, string const & indexFile,
295 vector<string> const & dictionaryFiles ):
296 BtreeDictionary( id, dictionaryFiles ),
297 idx( indexFile, "rb" ),
298 idxHeader( idx.read< IdxHeader >() ),
299 chunks( idx, idxHeader.chunksOffset ),
300 deferredInitRunnableStarted( false )
301 {
302 // Read the dictionary's name
303 idx.seek( sizeof( idxHeader ) );
304 size_t len = idx.read< uint32_t >();
305 vector< char > buf( len );
306 if( len > 0 )
307 {
308 idx.read( &buf.front(), len );
309 dictionaryName = string( &buf.front(), len );
310 }
311
312 // then read the dictionary's encoding
313 len = idx.read< uint32_t >();
314 if( len > 0 )
315 {
316 buf.resize( len );
317 idx.read( &buf.front(), len );
318 encoding = string( &buf.front(), len );
319 }
320
321 dictFile.setFileName( QString::fromUtf8( dictionaryFiles[ 0 ].c_str() ) );
322 dictFile.open( QIODevice::ReadOnly );
323
324 // Full-text search parameters
325
326 can_FTS = true;
327
328 ftsIdxName = indexFile + "_FTS";
329
330 if( !Dictionary::needToRebuildIndex( dictionaryFiles, ftsIdxName )
331 && !FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this ) )
332 FTS_index_completed.ref();
333 }
334
~MdxDictionary()335 MdxDictionary::~MdxDictionary()
336 {
337 Mutex::Lock _( deferredInitMutex );
338
339 // Wait for init runnable to complete if it was ever started
340 if ( deferredInitRunnableStarted )
341 deferredInitRunnableExited.acquire();
342
343 dictFile.close();
344 }
345
346 //////// MdxDictionary::deferredInit()
347
348 class MdxDeferredInitRunnable: public QRunnable
349 {
350 MdxDictionary & dictionary;
351 QSemaphore & hasExited;
352
353 public:
354
MdxDeferredInitRunnable(MdxDictionary & dictionary_,QSemaphore & hasExited_)355 MdxDeferredInitRunnable( MdxDictionary & dictionary_,
356 QSemaphore & hasExited_ ):
357 dictionary( dictionary_ ), hasExited( hasExited_ )
358 {}
359
~MdxDeferredInitRunnable()360 ~MdxDeferredInitRunnable()
361 {
362 hasExited.release();
363 }
364
run()365 virtual void run()
366 {
367 dictionary.doDeferredInit();
368 }
369 };
370
deferredInit()371 void MdxDictionary::deferredInit()
372 {
373 if ( !Qt4x5::AtomicInt::loadAcquire( deferredInitDone ) )
374 {
375 Mutex::Lock _( deferredInitMutex );
376
377 if ( Qt4x5::AtomicInt::loadAcquire( deferredInitDone ) )
378 return;
379
380 if ( !deferredInitRunnableStarted )
381 {
382 QThreadPool::globalInstance()->start(
383 new MdxDeferredInitRunnable( *this, deferredInitRunnableExited ),
384 -1000 );
385 deferredInitRunnableStarted = true;
386 }
387 }
388 }
389
/// Makes sure the deferred initialization has run (performing it on the
/// calling thread if necessary) and returns the recorded error message.
/// An empty string means initialization succeeded.
string const & MdxDictionary::ensureInitDone()
{
  doDeferredInit();
  return initError;
}
395
doDeferredInit()396 void MdxDictionary::doDeferredInit()
397 {
398 if ( !Qt4x5::AtomicInt::loadAcquire( deferredInitDone ) )
399 {
400 Mutex::Lock _( deferredInitMutex );
401
402 if ( Qt4x5::AtomicInt::loadAcquire( deferredInitDone ) )
403 return;
404
405 // Do deferred init
406
407 try
408 {
409 // Retrieve stylesheets
410 idx.seek( idxHeader.styleSheetAddress );
411 for ( uint32_t i = 0; i < idxHeader.styleSheetCount; i++ )
412 {
413 qint32 key = idx.read< qint32 >();
414 vector< char > buf;
415 quint32 sz;
416
417 sz = idx.read< quint32 >();
418 buf.resize( sz );
419 idx.read( &buf.front(), sz );
420 QString styleBegin = QString::fromUtf8( buf.data() );
421
422 sz = idx.read< quint32 >();
423 buf.resize( sz );
424 idx.read( &buf.front(), sz );
425 QString styleEnd = QString::fromUtf8( buf.data() );
426
427 styleSheets[ key ] = pair<QString, QString>( styleBegin, styleEnd );
428 }
429
430 // Initialize the index
431 openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
432 idxHeader.indexRootOffset ), idx, idxMutex );
433
434 vector< string > mddFileNames;
435 vector< IndexInfo > mddIndexInfos;
436 idx.seek( idxHeader.mddIndexInfosOffset );
437 for ( uint32_t i = 0; i < idxHeader.mddIndexInfosCount; i++ )
438 {
439 quint32 sz = idx.read< quint32 >();
440 vector< char > buf( sz );
441 idx.read( &buf.front(), sz );
442 uint32_t btreeMaxElements = idx.read<uint32_t>();
443 uint32_t rootOffset = idx.read<uint32_t>();
444 mddFileNames.push_back( string( &buf.front() ) );
445 mddIndexInfos.push_back( IndexInfo( btreeMaxElements, rootOffset ) );
446 }
447
448 vector< string > const dictFiles = getDictionaryFilenames();
449 for ( uint32_t i = 1; i < dictFiles.size() && i < mddFileNames.size() + 1; i++ )
450 {
451 QFileInfo fi( QString::fromUtf8( dictFiles[ i ].c_str() ) );
452 QString mddFileName = QString::fromUtf8( mddFileNames[ i - 1 ].c_str() );
453
454 if ( fi.fileName() != mddFileName || !fi.exists() )
455 continue;
456
457 sptr< IndexedMdd > mdd = new IndexedMdd( idxMutex, chunks );
458 mdd->openIndex( mddIndexInfos[ i - 1 ], idx, idxMutex );
459 mdd->open( dictFiles[ i ].c_str() );
460 mddResources.push_back( mdd );
461 }
462 }
463 catch ( std::exception & e )
464 {
465 initError = e.what();
466 }
467 catch ( ... )
468 {
469 initError = "Unknown error";
470 }
471
472 deferredInitDone.ref();
473 }
474 }
475
makeFTSIndex(QAtomicInt & isCancelled,bool firstIteration)476 void MdxDictionary::makeFTSIndex( QAtomicInt & isCancelled, bool firstIteration )
477 {
478 if( !( Dictionary::needToRebuildIndex( getDictionaryFilenames(), ftsIdxName )
479 || FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this ) ) )
480 FTS_index_completed.ref();
481
482 if( haveFTSIndex() )
483 return;
484
485 if( ensureInitDone().size() )
486 return;
487
488 if( firstIteration && getArticleCount() > FTS::MaxDictionarySizeForFastSearch )
489 return;
490
491 gdDebug( "MDict: Building the full-text index for dictionary: %s\n",
492 getName().c_str() );
493
494 try
495 {
496 FtsHelpers::makeFTSIndex( this, isCancelled );
497 FTS_index_completed.ref();
498 }
499 catch( std::exception &ex )
500 {
501 gdWarning( "MDict: Failed building full-text search index for \"%s\", reason: %s\n", getName().c_str(), ex.what() );
502 QFile::remove( FsEncoding::decode( ftsIdxName.c_str() ) );
503 }
504 }
505
getArticleText(uint32_t articleAddress,QString & headword,QString & text)506 void MdxDictionary::getArticleText( uint32_t articleAddress, QString & headword, QString & text )
507 {
508 try
509 {
510 headword.clear();
511 string articleText;
512
513 loadArticle( articleAddress, articleText, true );
514 text = Html::unescape( QString::fromUtf8( articleText.data(), articleText.size() ) );
515 }
516 catch( std::exception &ex )
517 {
518 gdWarning( "MDict: Failed retrieving article from \"%s\", reason: %s\n", getName().c_str(), ex.what() );
519 }
520 }
521
/// Creates a full-text search request for this dictionary. All arguments are
/// forwarded unchanged to FtsHelpers::FTSResultsRequest.
sptr< Dictionary::DataRequest > MdxDictionary::getSearchResults( QString const & searchString,
                                                                 int searchMode, bool matchCase,
                                                                 int distanceBetweenWords,
                                                                 int maxResults,
                                                                 bool ignoreWordsOrder,
                                                                 bool ignoreDiacritics )
{
  return new FtsHelpers::FTSResultsRequest( *this, searchString,searchMode, matchCase, distanceBetweenWords, maxResults, ignoreWordsOrder, ignoreDiacritics );
}
531
532 /// MdxDictionary::getArticle
533
534 class MdxArticleRequest;
535
536 class MdxArticleRequestRunnable: public QRunnable
537 {
538 MdxArticleRequest & r;
539 QSemaphore & hasExited;
540
541 public:
542
MdxArticleRequestRunnable(MdxArticleRequest & r_,QSemaphore & hasExited_)543 MdxArticleRequestRunnable( MdxArticleRequest & r_,
544 QSemaphore & hasExited_ ):
545 r( r_ ),
546 hasExited( hasExited_ )
547 {}
548
~MdxArticleRequestRunnable()549 ~MdxArticleRequestRunnable()
550 {
551 hasExited.release();
552 }
553
554 virtual void run();
555 };
556
/// Asynchronous article lookup for an MdxDictionary.
///
/// The constructor queues an MdxArticleRequestRunnable on the global thread
/// pool; that task calls run() and releases hasExited from its destructor.
/// The request's own destructor raises the cancel flag and then blocks on
/// hasExited, so the object is not torn down while the task still exists.
class MdxArticleRequest: public Dictionary::DataRequest
{
  friend class MdxArticleRequestRunnable;

  wstring word;
  vector< wstring > alts;
  MdxDictionary & dict;
  bool ignoreDiacritics;

  QAtomicInt isCancelled; // Non-zero once cancellation was requested
  QSemaphore hasExited;   // Released by the runnable's destructor

public:

  /// Stores the query and immediately schedules the lookup.
  MdxArticleRequest( wstring const & word_,
                     vector< wstring > const & alts_,
                     MdxDictionary & dict_,
                     bool ignoreDiacritics_ ):
    word( word_ ),
    alts( alts_ ),
    dict( dict_ ),
    ignoreDiacritics( ignoreDiacritics_ )
  {
    QThreadPool::globalInstance()->start( new MdxArticleRequestRunnable( *this, hasExited ) );
  }

  /// Performs the lookup; called by MdxArticleRequestRunnable::run().
  void run();

  /// Requests cancellation; run() polls the flag.
  virtual void cancel()
  {
    isCancelled.ref();
  }

  /// Cancels the lookup and waits until the runnable has been destroyed.
  ~MdxArticleRequest()
  {
    isCancelled.ref();
    hasExited.acquire();
  }
};
596
/// Thread-pool entry point: forwards to MdxArticleRequest::run().
void MdxArticleRequestRunnable::run()
{
  r.run();
}
601
run()602 void MdxArticleRequest::run()
603 {
604 if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
605 {
606 finish();
607 return;
608 }
609
610 if ( dict.ensureInitDone().size() )
611 {
612 setErrorString( QString::fromUtf8( dict.ensureInitDone().c_str() ) );
613 finish();
614 return;
615 }
616
617 vector< WordArticleLink > chain = dict.findArticles( word, ignoreDiacritics );
618
619 for ( unsigned x = 0; x < alts.size(); ++x )
620 {
621 /// Make an additional query for each alt
622 vector< WordArticleLink > altChain = dict.findArticles( alts[ x ], ignoreDiacritics );
623 chain.insert( chain.end(), altChain.begin(), altChain.end() );
624 }
625
626 // Some synonims make it that the articles appear several times. We combat this
627 // by only allowing them to appear once.
628 set< uint32_t > articlesIncluded;
629 // Sometimes the articles are physically duplicated. We store hashes of
630 // the bodies to account for this.
631 set< QByteArray > articleBodiesIncluded;
632 string articleText;
633
634 for ( unsigned x = 0; x < chain.size(); ++x )
635 {
636 if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
637 {
638 finish();
639 return;
640 }
641
642 if ( articlesIncluded.find( chain[ x ].articleOffset ) != articlesIncluded.end() )
643 continue; // We already have this article in the body.
644
645 // Grab that article
646 string articleBody;
647 bool hasError = false;
648 QString errorMessage;
649
650 try
651 {
652 dict.loadArticle( chain[ x ].articleOffset, articleBody );
653 }
654 catch ( exCorruptDictionary & )
655 {
656 errorMessage = tr( "Dictionary file was tampered or corrupted" );
657 hasError = true;
658 }
659 catch ( std::exception & e )
660 {
661 errorMessage = e.what();
662 hasError = true;
663 }
664
665 if ( hasError )
666 {
667 setErrorString( tr( "Failed loading article from %1, reason: %2" )
668 .arg( QString::fromUtf8( dict.getDictionaryFilenames()[ 0 ].c_str() ) )
669 .arg( errorMessage ) );
670 finish();
671 return;
672 }
673
674 if ( articlesIncluded.find( chain[ x ].articleOffset ) != articlesIncluded.end() )
675 continue; // We already have this article in the body.
676
677 QCryptographicHash hash( QCryptographicHash::Md5 );
678 hash.addData( articleBody.data(), articleBody.size() );
679 if ( !articleBodiesIncluded.insert( hash.result() ).second )
680 continue; // Already had this body
681
682 // Handle internal redirects
683 if ( strncmp( articleBody.c_str(), "@@@LINK=", 8 ) == 0 )
684 {
685 wstring target = Utf8::decode( articleBody.c_str() + 8 );
686 target = Folding::trimWhitespace( target );
687 // Make an additional query for this redirection
688 vector< WordArticleLink > altChain = dict.findArticles( target );
689 chain.insert( chain.end(), altChain.begin(), altChain.end() );
690 continue;
691 }
692
693 // See Issue #271: A mechanism to clean-up invalid HTML cards.
694 string cleaner = "</font>""</font>""</font>""</font>""</font>""</font>"
695 "</font>""</font>""</font>""</font>""</font>""</font>"
696 "</b></b></b></b></b></b></b></b>"
697 "</i></i></i></i></i></i></i></i>"
698 "</a></a></a></a></a></a></a></a>";
699 articleText += "<div class=\"mdict\">" + articleBody + cleaner + "</div>\n";
700 }
701
702 if ( !articleText.empty() )
703 {
704 Mutex::Lock _( dataMutex );
705 data.insert( data.end(), articleText.begin(), articleText.end() );
706 hasAnyData = true;
707 }
708
709 finish();
710 }
711
/// Starts an asynchronous article lookup for the word and its alternative
/// forms. The third (unnamed) parameter is ignored.
sptr<Dictionary::DataRequest> MdxDictionary::getArticle( const wstring & word, const vector<wstring> & alts,
                                                         const wstring &, bool ignoreDiacritics ) THROW_SPEC( std::exception )
{
  return new MdxArticleRequest( word, alts, *this, ignoreDiacritics );
}
717
718 /// MdxDictionary::getResource
719
720 class MddResourceRequest;
721
722 class MddResourceRequestRunnable: public QRunnable
723 {
724 MddResourceRequest & r;
725 QSemaphore & hasExited;
726
727 public:
728
MddResourceRequestRunnable(MddResourceRequest & r_,QSemaphore & hasExited_)729 MddResourceRequestRunnable( MddResourceRequest & r_,
730 QSemaphore & hasExited_ ): r( r_ ),
731 hasExited( hasExited_ )
732 {}
733
~MddResourceRequestRunnable()734 ~MddResourceRequestRunnable()
735 {
736 hasExited.release();
737 }
738
739 virtual void run();
740 };
741
/// Asynchronous resource lookup (images, sounds, stylesheets, ...) for an
/// MdxDictionary.
///
/// The constructor queues an MddResourceRequestRunnable on the global thread
/// pool; that task calls run() and releases hasExited from its destructor.
/// The request's own destructor raises the cancel flag and then blocks on
/// hasExited, so the object is not torn down while the task still exists.
class MddResourceRequest: public Dictionary::DataRequest
{
  friend class MddResourceRequestRunnable;

  MdxDictionary & dict;
  wstring resourceName;   // Decoded from the UTF-8 name given to the constructor
  QAtomicInt isCancelled; // Non-zero once cancellation was requested
  QSemaphore hasExited;   // Released by the runnable's destructor

public:

  /// Stores the decoded resource name and immediately schedules the lookup.
  MddResourceRequest( MdxDictionary & dict_,
                      string const & resourceName_ ):
    dict( dict_ ),
    resourceName( Utf8::decode( resourceName_ ) )
  {
    QThreadPool::globalInstance()->start( new MddResourceRequestRunnable( *this, hasExited ) );
  }

  void run(); // Run from another thread by MddResourceRequestRunnable

  /// Requests cancellation; run() polls the flag.
  virtual void cancel()
  {
    isCancelled.ref();
  }

  /// Cancels the lookup and waits until the runnable has been destroyed.
  ~MddResourceRequest()
  {
    isCancelled.ref();
    hasExited.acquire();
  }
};
774
/// Thread-pool entry point: forwards to MddResourceRequest::run().
void MddResourceRequestRunnable::run()
{
  r.run();
}
779
run()780 void MddResourceRequest::run()
781 {
782 if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
783 {
784 finish();
785 return;
786 }
787
788 if ( dict.ensureInitDone().size() )
789 {
790 setErrorString( QString::fromUtf8( dict.ensureInitDone().c_str() ) );
791 finish();
792 return;
793 }
794
795 // In order to prevent recursive internal redirection...
796 set< QByteArray > resourceIncluded;
797
798 for ( ;; )
799 {
800 // Some runnables linger enough that they are cancelled before they start
801 if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
802 {
803 finish();
804 return;
805 }
806
807 string u8ResourceName = Utf8::encode( resourceName );
808 QCryptographicHash hash( QCryptographicHash::Md5 );
809 hash.addData( u8ResourceName.data(), u8ResourceName.size() );
810 if ( !resourceIncluded.insert( hash.result() ).second )
811 continue;
812
813 // Convert to the Windows separator
814 std::replace( resourceName.begin(), resourceName.end(), '/', '\\' );
815 if ( resourceName[ 0 ] != '\\' )
816 {
817 resourceName.insert( 0, 1, '\\' );
818 }
819
820 Mutex::Lock _( dataMutex );
821 data.clear();
822
823 try
824 {
825 // local file takes precedence
826 string fn = FsEncoding::dirname( dict.getDictionaryFilenames()[ 0 ] ) +
827 FsEncoding::separator() + u8ResourceName;
828 File::loadFromFile( fn, data );
829 }
830 catch ( File::exCantOpen & )
831 {
832 for ( vector< sptr< IndexedMdd > >::const_iterator i = dict.mddResources.begin();
833 i != dict.mddResources.end(); i++ )
834 {
835 sptr< IndexedMdd > mddResource = *i;
836 if ( mddResource->loadFile( resourceName, data ) )
837 break;
838 }
839 }
840
841 // Check if this file has a redirection
842 // Always encoded in UTF16-LE
843 // L"@@@LINK="
844 static const char pattern[16] =
845 {
846 '@', '\0', '@', '\0', '@', '\0', 'L', '\0', 'I', '\0', 'N', '\0', 'K', '\0', '=', '\0'
847 };
848
849 if ( data.size() > sizeof( pattern ) )
850 {
851 if ( memcmp( &data.front(), pattern, sizeof( pattern ) ) == 0 )
852 {
853 data.push_back( '\0' );
854 data.push_back( '\0' );
855 QString target = MdictParser::toUtf16( "UTF-16LE", &data.front() + sizeof( pattern ),
856 data.size() - sizeof( pattern ) );
857 resourceName = gd::toWString( target.trimmed() );
858 continue;
859 }
860 }
861
862 if ( data.size() > 0 )
863 {
864 hasAnyData = true;
865
866 if ( Filetype::isNameOfCSS( u8ResourceName ) )
867 {
868 QString css = QString::fromUtf8( data.data(), data.size() );
869
870 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
871 QRegularExpression links( "url\\(\\s*(['\"]?)([^'\"]*)(['\"]?)\\s*\\)",
872 QRegularExpression::CaseInsensitiveOption );
873 #else
874 QRegExp links( "url\\(\\s*(['\"]?)([^'\"]*)(['\"]?)\\s*\\)", Qt::CaseInsensitive, QRegExp::RegExp );
875 #endif
876 QString id = QString::fromUtf8( dict.getId().c_str() );
877 int pos = 0;
878
879 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
880 QString newCSS;
881 QRegularExpressionMatchIterator it = links.globalMatch( css );
882 while ( it.hasNext() )
883 {
884 QRegularExpressionMatch match = it.next();
885 newCSS += css.midRef( pos, match.capturedStart() - pos );
886 pos = match.capturedEnd();
887 QString url = match.captured( 2 );
888 #else
889 for( ; ; )
890 {
891 pos = links.indexIn( css, pos );
892 if( pos < 0 )
893 break;
894 QString url = links.cap( 2 );
895 #endif
896
897 if( url.indexOf( ":/" ) >= 0 || url.indexOf( "data:" ) >= 0)
898 {
899 // External link or base64-encoded data
900 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
901 newCSS += match.captured();
902 #else
903 pos += links.cap().size();
904 #endif
905 continue;
906 }
907
908 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
909 QString newUrl = QString( "url(" ) + match.captured( 1 ) + "bres://"
910 + id + "/" + url + match.captured( 3 ) + ")";
911 newCSS += newUrl;
912 #else
913 QString newUrl = QString( "url(" ) + links.cap( 1 ) + "bres://"
914 + id + "/" + url + links.cap( 3 ) + ")";
915 css.replace( pos, links.cap().size(), newUrl );
916 pos += newUrl.size();
917 #endif
918 }
919 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
920 if( pos )
921 {
922 newCSS += css.midRef( pos );
923 css = newCSS;
924 newCSS.clear();
925 }
926 #endif
927 dict.isolateCSS( css, ".mdict" );
928 QByteArray bytes = css.toUtf8();
929 data.resize( bytes.size() );
930 memcpy( &data.front(), bytes.constData(), bytes.size() );
931 }
932 }
933 break;
934 }
935
936 finish();
937 }
938
/// Starts an asynchronous lookup of the named resource (UTF-8 name).
sptr<Dictionary::DataRequest> MdxDictionary::getResource( const string & name ) THROW_SPEC( std::exception )
{
  return new MddResourceRequest( *this, name );
}
943
944 const QString & MdxDictionary::getDescription()
945 {
946 if ( !dictionaryDescription.isEmpty() )
947 return dictionaryDescription;
948
949 if ( idxHeader.descriptionSize == 0 )
950 {
951 dictionaryDescription = "NONE";
952 }
953 else
954 {
955 Mutex::Lock _( idxMutex );
956 vector< char > chunk;
957 char * dictDescription = chunks.getBlock( idxHeader.descriptionAddress, chunk );
958 string str( dictDescription );
959 dictionaryDescription = QString::fromUtf8( str.c_str(), str.size() );
960 }
961
962 return dictionaryDescription;
963 }
964
965 void MdxDictionary::loadIcon() throw()
966 {
967 if ( dictionaryIconLoaded )
968 return;
969
970 QString fileName =
971 QDir::fromNativeSeparators( FsEncoding::decode( getDictionaryFilenames()[ 0 ].c_str() ) );
972
973 // Remove the extension
974 fileName.chop( 3 );
975
976 if ( !loadIconFromFile( fileName ) )
977 {
978 // Use default icons
979 dictionaryIcon = dictionaryNativeIcon = QIcon( ":/icons/mdict.png" );
980 }
981
982 dictionaryIconLoaded = true;
983 }
984
985 void MdxDictionary::loadArticle( uint32_t offset, string & articleText, bool noFilter )
986 {
987 vector< char > chunk;
988 Mutex::Lock _( idxMutex );
989
990 // Load record info from index
991 MdictParser::RecordInfo recordInfo;
992 char * pRecordInfo = chunks.getBlock( offset, chunk );
993 memcpy( &recordInfo, pRecordInfo, sizeof( recordInfo ) );
994
995 // Make a sub unique id for this article
996 QString articleId;
997 articleId.setNum( ( quint64 )pRecordInfo, 16 );
998
999 ScopedMemMap compressed( dictFile, recordInfo.compressedBlockPos, recordInfo.compressedBlockSize );
1000 if ( !compressed.startAddress() )
1001 throw exCorruptDictionary();
1002
1003 QByteArray decompressed;
1004 if ( !MdictParser::parseCompressedBlock( recordInfo.compressedBlockSize, ( char * )compressed.startAddress(),
1005 recordInfo.decompressedBlockSize, decompressed ) )
1006 throw exCorruptDictionary();
1007
1008 QString article = MdictParser::toUtf16( encoding.c_str(),
1009 decompressed.constData() + recordInfo.recordOffset,
1010 recordInfo.recordSize );
1011
1012 article = MdictParser::substituteStylesheet( article, styleSheets );
1013
1014 if( !noFilter )
1015 article = filterResource( articleId, article );
1016
1017 // Check for unclosed <span> and <div>
1018
1019 int openTags = article.count( QRegExp( "<\\s*span\\b", Qt::CaseInsensitive ) );
1020 int closedTags = article.count( QRegExp( "<\\s*/span\\s*>", Qt::CaseInsensitive ) );
1021 while( openTags > closedTags )
1022 {
1023 article += "</span>";
1024 closedTags += 1;
1025 }
1026
1027 openTags = article.count( QRegExp( "<\\s*div\\b", Qt::CaseInsensitive ) );
1028 closedTags = article.count( QRegExp( "<\\s*/div\\s*>", Qt::CaseInsensitive ) );
1029 while( openTags > closedTags )
1030 {
1031 article += "</div>";
1032 closedTags += 1;
1033 }
1034
1035 articleText = string( article.toUtf8().constData() );
1036 }
1037
1038 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
1039 QString & MdxDictionary::filterResource( QString const & articleId, QString & article )
1040 {
1041 QString id = QString::fromStdString( getId() );
1042 QString uniquePrefix = QString::fromLatin1( "g" ) + id + "_" + articleId + "_";
1043
1044 QRegularExpression allLinksRe( "(?:<\\s*(a(?:rea)?|img|link|script)(?:\\s+[^>]+|\\s*)>)",
1045 QRegularExpression::CaseInsensitiveOption );
1046 QRegularExpression wordCrossLink( "([\\s\"']href\\s*=)\\s*([\"'])entry://([^>#]*?)((?:#[^>]*?)?)\\2",
1047 QRegularExpression::CaseInsensitiveOption );
1048 QRegularExpression anchorIdRe( "([\\s\"'](?:name|id)\\s*=)\\s*([\"'])\\s*(?=\\S)",
1049 QRegularExpression::CaseInsensitiveOption );
1050 QRegularExpression anchorIdRe2( "([\\s\"'](?:name|id)\\s*=)\\s*(?=[^\"'])([^\\s\">]+)",
1051 QRegularExpression::CaseInsensitiveOption );
1052 QRegularExpression anchorLinkRe( "([\\s\"']href\\s*=\\s*[\"'])entry://#",
1053 QRegularExpression::CaseInsensitiveOption );
1054 QRegularExpression audioRe( "([\\s\"']href\\s*=)\\s*([\"'])sound://([^\">]+)\\2",
1055 QRegularExpression::CaseInsensitiveOption
1056 | QRegularExpression::InvertedGreedinessOption );
1057 QRegularExpression stylesRe( "([\\s\"']href\\s*=)\\s*([\"'])(?!\\s*\\b(?:(?:bres|https?|ftp)://|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\">]+)\\2",
1058 QRegularExpression::CaseInsensitiveOption );
1059 QRegularExpression stylesRe2( "([\\s\"']href\\s*=)\\s*(?![\\s\"']|\\b(?:(?:bres|https?|ftp)://|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\\s\">]+)",
1060 QRegularExpression::CaseInsensitiveOption );
1061 QRegularExpression inlineScriptRe( "<\\s*script(?:(?=\\s)(?:(?![\\s\"']src\\s*=)[^>])+|\\s*)>",
1062 QRegularExpression::CaseInsensitiveOption );
1063 QRegularExpression closeScriptTagRe( "<\\s*/script\\s*>",
1064 QRegularExpression::CaseInsensitiveOption );
1065 QRegularExpression srcRe( "([\\s\"']src\\s*=)\\s*([\"'])(?!\\s*\\b(?:(?:bres|https?|ftp)://|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\">]+)\\2",
1066 QRegularExpression::CaseInsensitiveOption );
1067 QRegularExpression srcRe2( "([\\s\"']src\\s*=)\\s*(?![\\s\"']|\\b(?:(?:bres|https?|ftp)://|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\\s\">]+)",
1068 QRegularExpression::CaseInsensitiveOption );
1069
1070 QString articleNewText;
1071 int linkPos = 0;
1072 QRegularExpressionMatchIterator it = allLinksRe.globalMatch( article );
1073 while( it.hasNext() )
1074 {
1075 QRegularExpressionMatch allLinksMatch = it.next();
1076
1077 if( allLinksMatch.capturedEnd() < linkPos )
1078 continue;
1079
1080 articleNewText += article.midRef( linkPos, allLinksMatch.capturedStart() - linkPos );
1081 linkPos = allLinksMatch.capturedEnd();
1082
1083 QString linkTxt = allLinksMatch.captured();
1084 QString linkType = allLinksMatch.captured( 1 ).toLower();
1085 QString newLink;
1086
1087 if( !linkType.isEmpty() && linkType.at( 0 ) == 'a' )
1088 {
1089 QRegularExpressionMatch match = anchorIdRe.match( linkTxt );
1090 if( match.hasMatch() )
1091 {
1092 QString newText = match.captured( 1 ) + match.captured( 2 ) + uniquePrefix;
1093 newLink = linkTxt.replace( match.capturedStart(), match.capturedLength(), newText );
1094 }
1095 else
1096 newLink = linkTxt.replace( anchorIdRe2, "\\1\"" + uniquePrefix + "\\2\"" );
1097
1098 newLink = newLink.replace( anchorLinkRe, "\\1#" + uniquePrefix );
1099
1100 match = audioRe.match( newLink );
1101 if( match.hasMatch() )
1102 {
1103 // sounds and audio link script
1104 QString newTxt = match.captured( 1 ) + match.captured( 2 )
1105 + "gdau://" + id + "/"
1106 + match.captured( 3 ) + match.captured( 2 );
1107 newLink = QString::fromUtf8( addAudioLink( "\"gdau://" + getId() + "/" + match.captured( 3 ).toUtf8().data() + "\"", getId() ).c_str() )
1108 + newLink.replace( match.capturedStart(), match.capturedLength(), newTxt );
1109 }
1110
1111 match = wordCrossLink.match( newLink );
1112 if( match.hasMatch() )
1113 {
1114 QString newTxt = match.captured( 1 ) + match.captured( 2 )
1115 + "gdlookup://localhost/"
1116 + match.captured( 3 );
1117
1118 if( match.lastCapturedIndex() >= 4 && !match.captured( 4 ).isEmpty() )
1119 newTxt += QString( "?gdanchor=" ) + uniquePrefix + match.captured( 4 ).mid( 1 );
1120
1121 newTxt += match.captured( 2 );
1122 newLink.replace( match.capturedStart(), match.capturedLength(), newTxt );
1123 }
1124 }
1125 else
1126 if( linkType.compare( "link" ) == 0 )
1127 {
1128 // stylesheets
1129 QRegularExpressionMatch match = stylesRe.match( linkTxt );
1130 if( match.hasMatch() )
1131 {
1132 QString newText = match.captured( 1 ) + match.captured( 2 )
1133 + "bres://" + id + "/"
1134 + match.captured( 3 ) + match.captured( 2 );
1135 newLink = linkTxt.replace( match.capturedStart(), match.capturedLength(), newText );
1136 }
1137 else
1138 newLink = linkTxt.replace( stylesRe2,
1139 "\\1\"bres://" + id + "/\\2\"" );
1140 }
1141 else
1142 if( linkType.compare( "script" ) == 0 || linkType.compare( "img" ) == 0 )
1143 {
1144 // javascripts and images
1145 QRegularExpressionMatch match = inlineScriptRe.match( linkTxt );
1146 if( linkType.at( 0 ) == 's'
1147 && match.hasMatch() && match.capturedLength() == linkTxt.length() )
1148 {
1149 // skip inline scripts
1150 articleNewText += linkTxt;
1151 match = closeScriptTagRe.match( article, linkPos );
1152 if( match.hasMatch() )
1153 {
1154 articleNewText += article.midRef( linkPos, match.capturedEnd() - linkPos );
1155 linkPos = match.capturedEnd();
1156 }
1157 continue;
1158 }
1159 else
1160 {
1161 match = srcRe.match( linkTxt );
1162 if( match.hasMatch() )
1163 {
1164 QString newText = match.captured( 1 ) + match.captured( 2 )
1165 + "bres://" + id + "/"
1166 + match.captured( 3 ) + match.captured( 2 );
1167 newLink = linkTxt.replace( match.capturedStart(), match.capturedLength(), newText );
1168 }
1169 else
1170 newLink = linkTxt.replace( srcRe2,
1171 "\\1\"bres://" + id + "/\\2\"" );
1172 }
1173 }
1174 if( !newLink.isEmpty() )
1175 {
1176 articleNewText += newLink;
1177 }
1178 else
1179 articleNewText += allLinksMatch.captured();
1180 }
1181 if( linkPos )
1182 {
1183 articleNewText += article.midRef( linkPos );
1184 article = articleNewText;
1185 }
1186
1187 return article;
1188 }
1189 #else
1190 QString & MdxDictionary::filterResource( QString const & articleId, QString & article )
1191 {
1192 QString id = QString::fromStdString( getId() );
1193 QString uniquePrefix = QString::fromLatin1( "g" ) + id + "_" + articleId + "_";
1194
1195 QRegExp allLinksRe( "(?:<\\s*(a(?:rea)?|img|link|script)(?:\\s+[^>]+|\\s*)>)", Qt::CaseInsensitive );
1196 QRegExp wordCrossLink( "([\\s\"']href\\s*=)\\s*([\"'])entry://([^>#]*)((?:#[^>]*)?)\\2", Qt::CaseInsensitive );
1197 wordCrossLink.setMinimal( true );
1198
1199 QRegExp anchorIdRe( "([\\s\"'](?:name|id)\\s*=)\\s*([\"'])\\s*(?=\\S)", Qt::CaseInsensitive );
1200 QRegExp anchorIdRe2( "([\\s\"'](?:name|id)\\s*=)\\s*(?=[^\"'])([^\\s\">]+)", Qt::CaseInsensitive );
1201 QRegExp anchorLinkRe( "([\\s\"']href\\s*=\\s*[\"'])entry://#", Qt::CaseInsensitive );
1202 QRegExp audioRe( "([\\s\"']href\\s*=)\\s*([\"'])sound://([^\">]+)\\2", Qt::CaseInsensitive );
1203 audioRe.setMinimal( true );
1204
1205 QRegExp stylesRe( "([\\s\"']href\\s*=)\\s*([\"'])(?!\\s*\\b(?:(?:bres|https?|ftp)://|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\">]+)\\2",
1206 Qt::CaseInsensitive, QRegExp::RegExp2 );
1207 stylesRe.setMinimal( true );
1208 QRegExp stylesRe2( "([\\s\"']href\\s*=)\\s*(?![\\s\"']|\\b(?:(?:bres|https?|ftp)://|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\\s\">]+)",
1209 Qt::CaseInsensitive, QRegExp::RegExp2 );
1210 QRegExp inlineScriptRe( "<\\s*script(?:(?=\\s)(?:(?![\\s\"']src\\s*=)[^>])+|\\s*)>", Qt::CaseInsensitive, QRegExp::RegExp2 );
1211 QRegExp closeScriptTagRe( "<\\s*/script\\s*>", Qt::CaseInsensitive, QRegExp::RegExp2 );
1212 QRegExp srcRe( "([\\s\"']src\\s*=)\\s*([\"'])(?!\\s*\\b(?:(?:bres|https?|ftp)://|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\">]+)\\2",
1213 Qt::CaseInsensitive, QRegExp::RegExp2 );
1214 srcRe.setMinimal( true );
1215 QRegExp srcRe2( "([\\s\"']src\\s*=)\\s*(?![\\s\"']|\\b(?:(?:bres|https?|ftp)://|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\\s\">]+)",
1216 Qt::CaseInsensitive, QRegExp::RegExp2 );
1217
1218 int linkPos = 0;
1219 while( linkPos >= 0 )
1220 {
1221 linkPos = allLinksRe.indexIn( article, linkPos );
1222 if( linkPos < 0 )
1223 break;
1224
1225 QString linkTxt = allLinksRe.cap( 0 );
1226 QString linkType = allLinksRe.cap( 1 ).toLower();
1227 QString newLink;
1228
1229 if( !linkType.isEmpty() && linkType.at( 0 ) == 'a' )
1230 {
1231 int pos = anchorIdRe.indexIn( linkTxt );
1232 if( pos >= 0 )
1233 {
1234 QString newText = anchorIdRe.cap( 1 ) + anchorIdRe.cap( 2 ) + uniquePrefix;
1235 newLink = linkTxt.replace( pos, anchorIdRe.cap().length(), newText );
1236 }
1237 else
1238 newLink = linkTxt.replace( anchorIdRe2, "\\1\"" + uniquePrefix + "\\2\"" );
1239
1240 newLink = newLink.replace( anchorLinkRe, "\\1#" + uniquePrefix );
1241
1242 pos = audioRe.indexIn( newLink );
1243 if( pos >= 0 )
1244 {
1245 // sounds and audio link script
1246 QString newTxt = audioRe.cap( 1 ) + audioRe.cap( 2 )
1247 + "gdau://" + id + "/"
1248 + audioRe.cap( 3 ) + audioRe.cap( 2 );
1249 newLink = QString::fromUtf8( addAudioLink( "\"gdau://" + getId() + "/" + audioRe.cap( 3 ).toUtf8().data() + "\"", getId() ).c_str() )
1250 + newLink.replace( pos, audioRe.cap().length(), newTxt );
1251 }
1252
1253 pos = wordCrossLink.indexIn( newLink );
1254 if( pos >= 0 )
1255 {
1256 QString newTxt = wordCrossLink.cap( 1 ) + wordCrossLink.cap( 2 )
1257 + "gdlookup://localhost/"
1258 + wordCrossLink.cap( 3 );
1259
1260 if( !wordCrossLink.cap( 4 ).isEmpty() )
1261 newTxt += QString( "?gdanchor=" ) + uniquePrefix + wordCrossLink.cap( 4 ).mid( 1 );
1262
1263 newTxt += wordCrossLink.cap( 2 );
1264 newLink.replace( pos, wordCrossLink.cap( 0 ).length(), newTxt );
1265 }
1266 }
1267 else
1268 if( linkType.compare( "link" ) == 0 )
1269 {
1270 // stylesheets
1271 int pos = stylesRe.indexIn( linkTxt );
1272 if( pos >= 0 )
1273 {
1274 QString newText = stylesRe.cap( 1 ) + stylesRe.cap( 2 )
1275 + "bres://" + id + "/"
1276 + stylesRe.cap( 3 ) + stylesRe.cap( 2 );
1277 newLink = linkTxt.replace( pos, stylesRe.cap().length(), newText );
1278 }
1279 else
1280 newLink = linkTxt.replace( stylesRe2,
1281 "\\1\"bres://" + id + "/\\2\"" );
1282 }
1283 else
1284 if( linkType.compare( "script" ) == 0 || linkType.compare( "img" ) == 0 )
1285 {
1286 // javascripts and images
1287 if( linkType.at( 0 ) == 's' && inlineScriptRe.exactMatch( linkTxt ) )
1288 {
1289 // skip inline scripts
1290 linkPos += linkTxt.length();
1291 int pos = closeScriptTagRe.indexIn( article, linkPos );
1292 if( pos > 0 )
1293 linkPos = pos + closeScriptTagRe.cap().length();
1294 continue;
1295 }
1296 else
1297 {
1298 int pos = srcRe.indexIn( linkTxt );
1299 if( pos >= 0 )
1300 {
1301 QString newText = srcRe.cap( 1 ) + srcRe.cap( 2 )
1302 + "bres://" + id + "/"
1303 + srcRe.cap( 3 ) + srcRe.cap( 2 );
1304 newLink = linkTxt.replace( pos, srcRe.cap().length(), newText );
1305 }
1306 else
1307 newLink = linkTxt.replace( srcRe2,
1308 "\\1\"bres://" + id + "/\\2\"" );
1309 }
1310 }
1311 if( !newLink.isEmpty() )
1312 {
1313 article.replace( linkPos, allLinksRe.cap().length(), newLink );
1314 linkPos += newLink.length();
1315 }
1316 else
1317 linkPos += allLinksRe.cap().length();
1318 }
1319
1320 return article;
1321 }
1322 #endif
1323
1324 static void addEntryToIndex( QString const & word, uint32_t offset, IndexedWords & indexedWords )
1325 {
1326 // Strip any leading or trailing whitespaces
1327 QString wordTrimmed = word.trimmed();
1328 indexedWords.addWord( gd::toWString( wordTrimmed ), offset );
1329 }
1330
1331 static void addEntryToIndexSingle( QString const & word, uint32_t offset, IndexedWords & indexedWords )
1332 {
1333 // Strip any leading or trailing whitespaces
1334 QString wordTrimmed = word.trimmed();
1335 indexedWords.addSingleWord( gd::toWString( wordTrimmed ), offset );
1336 }
1337
1338 class ArticleHandler: public MdictParser::RecordHandler
1339 {
1340 public:
1341 ArticleHandler( ChunkedStorage::Writer & chunks, IndexedWords & indexedWords ) :
1342 chunks( chunks ),
1343 indexedWords( indexedWords )
1344 {
1345 }
1346
1347 virtual void handleRecord( QString const & headWord, MdictParser::RecordInfo const & recordInfo )
1348 {
1349 // Save the article's record info
1350 uint32_t articleAddress = chunks.startNewBlock();
1351 chunks.addToBlock( &recordInfo, sizeof( recordInfo ) );
1352 // Add entries to the index
1353 addEntryToIndex( headWord, articleAddress, indexedWords );
1354 }
1355
1356 private:
1357 ChunkedStorage::Writer & chunks;
1358 IndexedWords & indexedWords;
1359 };
1360
1361 class ResourceHandler: public MdictParser::RecordHandler
1362 {
1363 public:
1364 ResourceHandler( ChunkedStorage::Writer & chunks, IndexedWords & indexedWords ):
1365 chunks( chunks ),
1366 indexedWords( indexedWords )
1367 {
1368 }
1369
1370 virtual void handleRecord( QString const & fileName, MdictParser::RecordInfo const & recordInfo )
1371 {
1372 uint32_t resourceInfoAddress = chunks.startNewBlock();
1373 chunks.addToBlock( &recordInfo, sizeof( recordInfo ) );
1374 // Add entries to the index
1375 addEntryToIndexSingle( fileName, resourceInfoAddress, indexedWords );
1376 }
1377
1378 private:
1379 ChunkedStorage::Writer & chunks;
1380 IndexedWords & indexedWords;
1381 };
1382
1383
1384 static bool indexIsOldOrBad( vector< string > const & dictFiles, string const & indexFile )
1385 {
1386 File::Class idx( indexFile, "rb" );
1387 IdxHeader header;
1388
1389 return idx.readRecords( &header, sizeof( header ), 1 ) != 1 ||
1390 header.signature != kSignature ||
1391 header.formatVersion != kCurrentFormatVersion ||
1392 header.parserVersion != MdictParser::kParserVersion ||
1393 header.foldingVersion != Folding::Version ||
1394 header.mddIndexInfosCount != dictFiles.size() - 1;
1395 }
1396
1397 static void findResourceFiles( string const & mdx, vector< string > & dictFiles )
1398 {
1399 string base( mdx, 0, mdx.size() - 4 );
1400 // Check if there' is any file end with .mdd, which is the resource file for the dictionary
1401 string resFile;
1402 if ( File::tryPossibleName( base + ".mdd", resFile ) )
1403 {
1404 dictFiles.push_back( resFile );
1405 // Find complementary .mdd file (volumes), like follows:
1406 // demo.mdx <- main dictionary file
1407 // demo.mdd <- main resource file ( 1st volume )
1408 // demo.1.mdd <- 2nd volume
1409 // ...
1410 // demo.n.mdd <- nth volume
1411 QString baseU8 = QString::fromUtf8( base.c_str() );
1412 int vol = 1;
1413 while ( File::tryPossibleName( string( QString( "%1.%2.mdd" ).arg( baseU8 ).arg( vol )
1414 .toUtf8().constBegin() ), resFile ) )
1415 {
1416 dictFiles.push_back( resFile );
1417 vol++;
1418 }
1419 }
1420 }
1421
1422 vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & fileNames,
1423 string const & indicesDir,
1424 Dictionary::Initializing & initializing ) THROW_SPEC( std::exception )
1425 {
1426 vector< sptr< Dictionary::Class > > dictionaries;
1427
1428 for ( vector< string >::const_iterator i = fileNames.begin(); i != fileNames.end(); i++ )
1429 {
1430 // Skip files with the extensions different to .mdx to speed up the
1431 // scanning
1432 if ( i->size() < 4 || strcasecmp( i->c_str() + ( i->size() - 4 ), ".mdx" ) != 0 )
1433 continue;
1434
1435 vector< string > dictFiles( 1, *i );
1436 findResourceFiles( *i, dictFiles );
1437
1438 string dictId = Dictionary::makeDictionaryId( dictFiles );
1439 string indexFile = indicesDir + dictId;
1440
1441 if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) ||
1442 indexIsOldOrBad( dictFiles, indexFile ) )
1443 {
1444 // Building the index
1445
1446 gdDebug( "MDict: Building the index for dictionary: %s\n", i->c_str() );
1447
1448 MdictParser parser;
1449 list< sptr< MdictParser > > mddParsers;
1450
1451 if ( !parser.open( i->c_str() ) )
1452 continue;
1453
1454 string title = string( parser.title().toUtf8().constData() );
1455 initializing.indexingDictionary( title );
1456
1457 for ( vector< string >::const_iterator mddIter = dictFiles.begin() + 1;
1458 mddIter != dictFiles.end(); mddIter++ )
1459 {
1460 if ( File::exists( *mddIter ) )
1461 {
1462 sptr< MdictParser > mddParser = new MdictParser();
1463 if ( !mddParser->open( mddIter->c_str() ) )
1464 {
1465 gdWarning( "Broken mdd (resource) file: %s\n", mddIter->c_str() );
1466 continue;
1467 }
1468 mddParsers.push_back( mddParser );
1469 }
1470 }
1471
1472 File::Class idx( indexFile, "wb" );
1473 IdxHeader idxHeader;
1474 memset( &idxHeader, 0, sizeof( idxHeader ) );
1475 // We write a dummy header first. At the end of the process the header
1476 // will be rewritten with the right values.
1477 idx.write( idxHeader );
1478
1479 // Write the title first
1480 idx.write< uint32_t >( title.size() );
1481 idx.write( title.data(), title.size() );
1482
1483 // then the encoding
1484 {
1485 string encoding = string( parser.encoding().toUtf8().constData() );
1486 idx.write< uint32_t >( encoding.size() );
1487 idx.write( encoding.data(), encoding.size() );
1488 }
1489
1490 // This is our index data that we accumulate during the loading process.
1491 // For each new word encountered, we emit the article's body to the file
1492 // immediately, inserting the word itself and its offset in this map.
1493 // This map maps folded words to the original words and the corresponding
1494 // articles' offsets.
1495 IndexedWords indexedWords;
1496 ChunkedStorage::Writer chunks( idx );
1497
1498 idxHeader.isRightToLeft = parser.isRightToLeft();
1499
1500 // Save dictionary description if there's one
1501 {
1502 string description = string( parser.description().toUtf8().constData() );
1503 idxHeader.descriptionAddress = chunks.startNewBlock();
1504 chunks.addToBlock( description.c_str(), description.size() + 1 );
1505 idxHeader.descriptionSize = description.size() + 1;
1506 }
1507
1508 ArticleHandler articleHandler( chunks, indexedWords );
1509 MdictParser::HeadWordIndex headWordIndex;
1510
1511 // enumerating word and its definition
1512 while ( parser.readNextHeadWordIndex( headWordIndex ) )
1513 {
1514 parser.readRecordBlock( headWordIndex, articleHandler );
1515 }
1516
1517 // enumerating resources if there's any
1518 vector< sptr< IndexedWords > > mddIndices;
1519 vector< string > mddFileNames;
1520 while ( !mddParsers.empty() )
1521 {
1522 sptr< MdictParser > mddParser = mddParsers.front();
1523 sptr< IndexedWords > mddIndexedWords = new IndexedWords();
1524 MdictParser::HeadWordIndex resourcesIndex;
1525 ResourceHandler resourceHandler( chunks, *mddIndexedWords );
1526
1527 while ( mddParser->readNextHeadWordIndex( headWordIndex ) )
1528 {
1529 resourcesIndex.insert( resourcesIndex.end(), headWordIndex.begin(), headWordIndex.end() );
1530 }
1531 mddParser->readRecordBlock( resourcesIndex, resourceHandler );
1532
1533 mddIndices.push_back( mddIndexedWords );
1534 // Save filename for .mdd files only
1535 QFileInfo fi( mddParser->filename() );
1536 mddFileNames.push_back( string( fi.fileName().toUtf8().constData() ) );
1537 mddParsers.pop_front();
1538 }
1539
1540 // Finish with the chunks
1541 idxHeader.chunksOffset = chunks.finish();
1542
1543 GD_DPRINTF( "Writing index...\n" );
1544
1545 // Good. Now build the index
1546 IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );
1547 idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
1548 idxHeader.indexRootOffset = idxInfo.rootOffset;
1549
1550 // Save dictionary stylesheets
1551 {
1552 MdictParser::StyleSheets const & styleSheets = parser.styleSheets();
1553 idxHeader.styleSheetAddress = idx.tell();
1554 idxHeader.styleSheetCount = styleSheets.size();
1555
1556 for ( MdictParser::StyleSheets::const_iterator iter = styleSheets.begin();
1557 iter != styleSheets.end(); iter++ )
1558 {
1559 string styleBegin( iter->second.first.toUtf8().constData() );
1560 string styleEnd( iter->second.second.toUtf8().constData() );
1561
1562 // key
1563 idx.write<qint32>( iter->first );
1564 // styleBegin
1565 idx.write<quint32>( ( quint32 )styleBegin.size() + 1 );
1566 idx.write( styleBegin.c_str(), styleBegin.size() + 1 );
1567 // styleEnd
1568 idx.write<quint32>( ( quint32 )styleEnd.size() + 1 );
1569 idx.write( styleEnd.c_str(), styleEnd.size() + 1 );
1570 }
1571 }
1572
1573 // read languages
1574 QPair<quint32, quint32> langs = LangCoder::findIdsForFilename( QString::fromStdString( *i ) );
1575
1576 // if no languages found, try dictionary's name
1577 if ( langs.first == 0 || langs.second == 0 )
1578 {
1579 langs = LangCoder::findIdsForFilename( parser.title() );
1580 }
1581
1582 idxHeader.langFrom = langs.first;
1583 idxHeader.langTo = langs.second;
1584
1585 // Build index info for each mdd file
1586 vector< IndexInfo > mddIndexInfos;
1587 for ( vector< sptr< IndexedWords > >::const_iterator mddIndexIter = mddIndices.begin();
1588 mddIndexIter != mddIndices.end(); mddIndexIter++ )
1589 {
1590 IndexInfo resourceIdxInfo = BtreeIndexing::buildIndex( *( *mddIndexIter ), idx );
1591 mddIndexInfos.push_back( resourceIdxInfo );
1592 }
1593
1594 // Save address of IndexInfos for resource files
1595 idxHeader.mddIndexInfosOffset = idx.tell();
1596 idxHeader.mddIndexInfosCount = mddIndexInfos.size();
1597 for ( uint32_t mi = 0; mi < mddIndexInfos.size(); mi++ )
1598 {
1599 const string & mddfile = mddFileNames[ mi ];
1600
1601 idx.write<quint32>( ( quint32 )mddfile.size() + 1 );
1602 idx.write( mddfile.c_str(), mddfile.size() + 1 );
1603 idx.write<uint32_t>( mddIndexInfos[ mi ].btreeMaxElements );
1604 idx.write<uint32_t>( mddIndexInfos[ mi ].rootOffset );
1605 }
1606
1607 // That concludes it. Update the header.
1608 idxHeader.signature = kSignature;
1609 idxHeader.formatVersion = kCurrentFormatVersion;
1610 idxHeader.parserVersion = MdictParser::kParserVersion;
1611 idxHeader.foldingVersion = Folding::Version;
1612 idxHeader.articleCount = parser.wordCount();
1613 idxHeader.wordCount = parser.wordCount();
1614
1615 idx.rewind();
1616 idx.write( &idxHeader, sizeof( idxHeader ) );
1617 }
1618
1619 dictionaries.push_back( new MdxDictionary( dictId, indexFile, dictFiles ) );
1620 }
1621
1622 return dictionaries;
1623 }
1624
1625 }
1626