1 /* This file is (c) 2013 Timon Wong <timon86.wang AT gmail DOT com>
2  * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
3 
4 #include "mdx.hh"
5 #include "btreeidx.hh"
6 #include "folding.hh"
7 #include "utf8.hh"
8 #include "file.hh"
9 #include "wstring.hh"
10 #include "wstring_qt.hh"
11 #include "chunkedstorage.hh"
12 #include "gddebug.hh"
13 #include "langcoder.hh"
14 #include "fsencoding.hh"
15 #include "audiolink.hh"
16 #include "ex.hh"
17 #include "mdictparser.hh"
18 #include "filetype.hh"
19 #include "ftshelpers.hh"
20 #include "htmlescape.hh"
21 
22 #include <algorithm>
23 #include <map>
24 #include <set>
25 #include <list>
26 #include <ctype.h>
27 #include <stdlib.h>
28 
29 #ifdef _MSC_VER
30 #include <stub_msvc.h>
31 #endif
32 
33 #include <QDir>
34 #include <QString>
35 #include <QSemaphore>
36 #include <QThreadPool>
37 #include <QAtomicInt>
38 #include <QTextDocument>
39 #include <QCryptographicHash>
40 
41 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
42 #include <QRegularExpression>
43 #endif
44 
45 #include "qt4x5.hh"
46 
47 namespace Mdx
48 {
49 
50 using std::map;
51 using std::multimap;
52 using std::set;
53 using gd::wstring;
54 using gd::wchar;
55 using std::list;
56 using std::pair;
57 using std::string;
58 
59 using BtreeIndexing::WordArticleLink;
60 using BtreeIndexing::IndexedWords;
61 using BtreeIndexing::IndexInfo;
62 
63 using namespace Mdict;
64 
// Constants describing the index file produced for MDict dictionaries.
enum
{
  // Magic number kept in IdxHeader::signature. On a little-endian host the
  // value is laid out in memory as the bytes 'M', 'I', 'D', 'C'.
  kSignature = 0x4349444d,  // MDIC
  // Local revision number (11) combined with the btree index format version
  // and the folding algorithm version, so that a change to any of the three
  // changes this value.
  kCurrentFormatVersion = 11 + BtreeIndexing::FormatVersion + Folding::Version
};
70 
// Thrown by MdxDictionary::loadArticle() when a record block cannot be mapped
// from the dictionary file or fails to decompress.
DEF_EX( exCorruptDictionary, "dictionary file was tampered or corrupted", std::exception )
72 
// Fixed-size header of the index file. MdxDictionary's constructor reads it
// first thing after opening the index; the dictionary name and the encoding
// name (each a uint32_t length followed by that many bytes) come right after it.
struct IdxHeader
{
  uint32_t signature; // First comes the signature, MDIC
  uint32_t formatVersion; // File format version of this index file
  uint32_t parserVersion; // Version of the parser used to parse the MDIC file.
  // Version of the folding algorithm used when building
  // index. If it's different from the current one,
  // the file is to be rebuilt.
  uint32_t foldingVersion;

  uint32_t articleCount; // Total number of articles, for informative purposes only
  uint32_t wordCount; // Total number of words, for informative purposes only

  uint32_t isRightToLeft; // Right to left
  uint32_t chunksOffset; // The offset to chunks' storage

  uint32_t descriptionAddress; // Address of the dictionary description in the chunks' storage
  uint32_t descriptionSize; // Size of the description in the chunks' storage, 0 = no description

  uint32_t styleSheetAddress; // Offset in the index file where the stylesheet records start
  uint32_t styleSheetCount; // Number of stylesheet records stored at styleSheetAddress

  uint32_t indexBtreeMaxElements; // Two fields from IndexInfo
  uint32_t indexRootOffset;

  uint32_t langFrom; // Source language
  uint32_t langTo; // Target language

  uint32_t mddIndexInfosOffset; // address of IndexInfos for resource files (.mdd)
  uint32_t mddIndexInfosCount; // count of IndexInfos for resource files
}
// Ask for a packed layout on compilers that understand the GCC attribute
#ifndef _MSC_VER
__attribute__( ( packed ) )
#endif
;
108 
109 // A helper method to read resources from .mdd file
110 class IndexedMdd: public BtreeIndexing::BtreeIndex
111 {
112   Mutex & idxMutex;
113   Mutex fileMutex;
114   ChunkedStorage::Reader & chunks;
115   QFile mddFile;
116   bool isFileOpen;
117 
118 public:
119 
IndexedMdd(Mutex & idxMutex,ChunkedStorage::Reader & chunks)120   IndexedMdd( Mutex & idxMutex, ChunkedStorage::Reader & chunks ):
121     idxMutex( idxMutex ),
122     chunks( chunks ),
123     isFileOpen( false )
124   {}
125 
126   /// Opens the index. The values are those previously returned by buildIndex().
127   using BtreeIndexing::BtreeIndex::openIndex;
128 
129   /// Opens the mdd file itself. Returns true if succeeded, false otherwise.
open(const char * fileName)130   bool open( const char * fileName )
131   {
132     mddFile.setFileName( QString::fromUtf8( fileName ) );
133     isFileOpen = mddFile.open( QFile::ReadOnly );
134     return isFileOpen;
135   }
136 
137   /// Returns true if the mdd is open, false otherwise.
isOpen() const138   inline bool isOpen() const
139   {
140     return isFileOpen;
141   }
142 
143   /// Checks whether the given file exists in the mdd file or not.
144   /// Note that this function is thread-safe, since it does not access mdd file.
hasFile(gd::wstring const & name)145   bool hasFile( gd::wstring const & name )
146   {
147     if ( !isFileOpen )
148       return false;
149     vector< WordArticleLink > links = findArticles( name );
150     return !links.empty();
151   }
152 
153   /// Attempts loading the given file into the given vector. Returns true on
154   /// success, false otherwise.
loadFile(gd::wstring const & name,std::vector<char> & result)155   bool loadFile( gd::wstring const & name, std::vector< char > & result )
156   {
157     if ( !isFileOpen )
158       return false;
159 
160     vector< WordArticleLink > links = findArticles( name );
161     if ( links.empty() )
162       return false;
163 
164     MdictParser::RecordInfo indexEntry;
165     vector< char > chunk;
166     Mutex::Lock _( idxMutex );
167     const char * indexEntryPtr = chunks.getBlock( links[ 0 ].articleOffset, chunk );
168     memcpy( &indexEntry, indexEntryPtr, sizeof( indexEntry ) );
169 
170     ScopedMemMap compressed( mddFile, indexEntry.compressedBlockPos, indexEntry.compressedBlockSize );
171     if ( !compressed.startAddress() )
172     {
173       return false;
174     }
175 
176     QByteArray decompressed;
177     if ( !MdictParser::parseCompressedBlock( indexEntry.compressedBlockSize, ( char * )compressed.startAddress(),
178                                              indexEntry.decompressedBlockSize, decompressed ) )
179     {
180       return false;
181     }
182 
183     result.resize( indexEntry.recordSize );
184     memcpy( &result.front(), decompressed.constData() + indexEntry.recordOffset, indexEntry.recordSize );
185     return true;
186   }
187 
188 };
189 
190 class MdxDictionary: public BtreeIndexing::BtreeDictionary
191 {
192   Mutex idxMutex;
193   File::Class idx;
194   IdxHeader idxHeader;
195   string dictionaryName;
196   string encoding;
197   ChunkedStorage::Reader chunks;
198   QFile dictFile;
199   vector< sptr< IndexedMdd > > mddResources;
200   MdictParser::StyleSheets styleSheets;
201 
202   QAtomicInt deferredInitDone;
203   Mutex deferredInitMutex;
204   bool deferredInitRunnableStarted;
205   QSemaphore deferredInitRunnableExited;
206 
207   string initError;
208 
209 public:
210 
211   MdxDictionary( string const & id, string const & indexFile, vector<string> const & dictionaryFiles );
212 
213   ~MdxDictionary();
214 
215   virtual void deferredInit();
216 
getName()217   virtual string getName() throw()
218   {
219     return dictionaryName;
220   }
221 
getProperties()222   virtual map< Dictionary::Property, string > getProperties() throw()
223   {
224     return map< Dictionary::Property, string >();
225   }
226 
getArticleCount()227   virtual unsigned long getArticleCount() throw()
228   {
229     return idxHeader.articleCount;
230   }
231 
getWordCount()232   virtual unsigned long getWordCount() throw()
233   {
234     return idxHeader.wordCount;
235   }
236 
getLangFrom() const237   inline virtual quint32 getLangFrom() const
238   {
239     return idxHeader.langFrom;
240   }
241 
getLangTo() const242   inline virtual quint32 getLangTo() const
243   {
244     return idxHeader.langTo;
245   }
246 
247   virtual sptr< Dictionary::DataRequest > getArticle( wstring const & word,
248                                                       vector< wstring > const & alts,
249                                                       wstring const &,
250                                                       bool ignoreDiacritics ) THROW_SPEC( std::exception );
251   virtual sptr< Dictionary::DataRequest > getResource( string const & name ) THROW_SPEC( std::exception );
252   virtual QString const & getDescription();
253 
254   virtual sptr< Dictionary::DataRequest > getSearchResults( QString const & searchString,
255                                                             int searchMode, bool matchCase,
256                                                             int distanceBetweenWords,
257                                                             int maxResults,
258                                                             bool ignoreWordsOrder,
259                                                             bool ignoreDiacritics );
260   virtual void getArticleText( uint32_t articleAddress, QString & headword, QString & text );
261 
262   virtual void makeFTSIndex(QAtomicInt & isCancelled, bool firstIteration );
263 
setFTSParameters(Config::FullTextSearch const & fts)264   virtual void setFTSParameters( Config::FullTextSearch const & fts )
265   {
266     if( ensureInitDone().size() )
267       return;
268 
269     can_FTS = fts.enabled
270               && !fts.disabledTypes.contains( "MDICT", Qt::CaseInsensitive )
271               && ( fts.maxDictionarySize == 0 || getArticleCount() <= fts.maxDictionarySize );
272   }
273 protected:
274 
275   virtual void loadIcon() throw();
276 
277 private:
278 
279   virtual string const & ensureInitDone();
280   void doDeferredInit();
281 
282   /// Loads an article with the given offset, filling the given strings.
283   void loadArticle( uint32_t offset, string & articleText, bool noFilter = false );
284 
285   /// Process resource links (images, audios, etc)
286   QString & filterResource( QString const & articleId, QString & article );
287 
288   friend class MdxHeadwordsRequest;
289   friend class MdxArticleRequest;
290   friend class MddResourceRequest;
291   friend class MdxDeferredInitRunnable;
292 };
293 
MdxDictionary(string const & id,string const & indexFile,vector<string> const & dictionaryFiles)294 MdxDictionary::MdxDictionary( string const & id, string const & indexFile,
295                               vector<string> const & dictionaryFiles ):
296   BtreeDictionary( id, dictionaryFiles ),
297   idx( indexFile, "rb" ),
298   idxHeader( idx.read< IdxHeader >() ),
299   chunks( idx, idxHeader.chunksOffset ),
300   deferredInitRunnableStarted( false )
301 {
302   // Read the dictionary's name
303   idx.seek( sizeof( idxHeader ) );
304   size_t len = idx.read< uint32_t >();
305   vector< char > buf( len );
306   if( len > 0 )
307   {
308     idx.read( &buf.front(), len );
309     dictionaryName = string( &buf.front(), len );
310   }
311 
312   // then read the dictionary's encoding
313   len = idx.read< uint32_t >();
314   if( len > 0 )
315   {
316     buf.resize( len );
317     idx.read( &buf.front(), len );
318     encoding = string( &buf.front(), len );
319   }
320 
321   dictFile.setFileName( QString::fromUtf8( dictionaryFiles[ 0 ].c_str() ) );
322   dictFile.open( QIODevice::ReadOnly );
323 
324   // Full-text search parameters
325 
326   can_FTS = true;
327 
328   ftsIdxName = indexFile + "_FTS";
329 
330   if( !Dictionary::needToRebuildIndex( dictionaryFiles, ftsIdxName )
331       && !FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this ) )
332     FTS_index_completed.ref();
333 }
334 
~MdxDictionary()335 MdxDictionary::~MdxDictionary()
336 {
337   Mutex::Lock _( deferredInitMutex );
338 
339   // Wait for init runnable to complete if it was ever started
340   if ( deferredInitRunnableStarted )
341     deferredInitRunnableExited.acquire();
342 
343   dictFile.close();
344 }
345 
346 //////// MdxDictionary::deferredInit()
347 
348 class MdxDeferredInitRunnable: public QRunnable
349 {
350   MdxDictionary & dictionary;
351   QSemaphore & hasExited;
352 
353 public:
354 
MdxDeferredInitRunnable(MdxDictionary & dictionary_,QSemaphore & hasExited_)355   MdxDeferredInitRunnable( MdxDictionary & dictionary_,
356                            QSemaphore & hasExited_ ):
357     dictionary( dictionary_ ), hasExited( hasExited_ )
358   {}
359 
~MdxDeferredInitRunnable()360   ~MdxDeferredInitRunnable()
361   {
362     hasExited.release();
363   }
364 
run()365   virtual void run()
366   {
367     dictionary.doDeferredInit();
368   }
369 };
370 
deferredInit()371 void MdxDictionary::deferredInit()
372 {
373   if ( !Qt4x5::AtomicInt::loadAcquire( deferredInitDone ) )
374   {
375     Mutex::Lock _( deferredInitMutex );
376 
377     if ( Qt4x5::AtomicInt::loadAcquire( deferredInitDone ) )
378       return;
379 
380     if ( !deferredInitRunnableStarted )
381     {
382       QThreadPool::globalInstance()->start(
383         new MdxDeferredInitRunnable( *this, deferredInitRunnableExited ),
384         -1000 );
385       deferredInitRunnableStarted = true;
386     }
387   }
388 }
389 
ensureInitDone()390 string const & MdxDictionary::ensureInitDone()
391 {
392   doDeferredInit();
393   return initError;
394 }
395 
doDeferredInit()396 void MdxDictionary::doDeferredInit()
397 {
398   if ( !Qt4x5::AtomicInt::loadAcquire( deferredInitDone ) )
399   {
400     Mutex::Lock _( deferredInitMutex );
401 
402     if ( Qt4x5::AtomicInt::loadAcquire( deferredInitDone ) )
403       return;
404 
405     // Do deferred init
406 
407     try
408     {
409       // Retrieve stylesheets
410       idx.seek( idxHeader.styleSheetAddress );
411       for ( uint32_t i = 0; i < idxHeader.styleSheetCount; i++ )
412       {
413         qint32 key = idx.read< qint32 >();
414         vector< char > buf;
415         quint32 sz;
416 
417         sz = idx.read< quint32 >();
418         buf.resize( sz );
419         idx.read( &buf.front(), sz );
420         QString styleBegin = QString::fromUtf8( buf.data() );
421 
422         sz = idx.read< quint32 >();
423         buf.resize( sz );
424         idx.read( &buf.front(), sz );
425         QString styleEnd = QString::fromUtf8( buf.data() );
426 
427         styleSheets[ key ] = pair<QString, QString>( styleBegin, styleEnd );
428       }
429 
430       // Initialize the index
431       openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
432                             idxHeader.indexRootOffset ), idx, idxMutex );
433 
434       vector< string > mddFileNames;
435       vector< IndexInfo > mddIndexInfos;
436       idx.seek( idxHeader.mddIndexInfosOffset );
437       for ( uint32_t i = 0; i < idxHeader.mddIndexInfosCount; i++ )
438       {
439         quint32 sz = idx.read< quint32 >();
440         vector< char > buf( sz );
441         idx.read( &buf.front(), sz );
442         uint32_t btreeMaxElements = idx.read<uint32_t>();
443         uint32_t rootOffset = idx.read<uint32_t>();
444         mddFileNames.push_back( string( &buf.front() ) );
445         mddIndexInfos.push_back( IndexInfo( btreeMaxElements, rootOffset ) );
446       }
447 
448       vector< string > const dictFiles = getDictionaryFilenames();
449       for ( uint32_t i = 1; i < dictFiles.size() && i < mddFileNames.size() + 1; i++ )
450       {
451         QFileInfo fi( QString::fromUtf8( dictFiles[ i ].c_str() ) );
452         QString mddFileName = QString::fromUtf8( mddFileNames[ i - 1 ].c_str() );
453 
454         if ( fi.fileName() != mddFileName || !fi.exists() )
455           continue;
456 
457         sptr< IndexedMdd > mdd = new IndexedMdd( idxMutex, chunks );
458         mdd->openIndex( mddIndexInfos[ i - 1 ], idx, idxMutex );
459         mdd->open( dictFiles[ i ].c_str() );
460         mddResources.push_back( mdd );
461       }
462     }
463     catch ( std::exception & e )
464     {
465       initError = e.what();
466     }
467     catch ( ... )
468     {
469       initError = "Unknown error";
470     }
471 
472     deferredInitDone.ref();
473   }
474 }
475 
makeFTSIndex(QAtomicInt & isCancelled,bool firstIteration)476 void MdxDictionary::makeFTSIndex( QAtomicInt & isCancelled, bool firstIteration )
477 {
478   if( !( Dictionary::needToRebuildIndex( getDictionaryFilenames(), ftsIdxName )
479          || FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this ) ) )
480     FTS_index_completed.ref();
481 
482   if( haveFTSIndex() )
483     return;
484 
485   if( ensureInitDone().size() )
486     return;
487 
488   if( firstIteration && getArticleCount() > FTS::MaxDictionarySizeForFastSearch )
489     return;
490 
491   gdDebug( "MDict: Building the full-text index for dictionary: %s\n",
492            getName().c_str() );
493 
494   try
495   {
496     FtsHelpers::makeFTSIndex( this, isCancelled );
497     FTS_index_completed.ref();
498   }
499   catch( std::exception &ex )
500   {
501     gdWarning( "MDict: Failed building full-text search index for \"%s\", reason: %s\n", getName().c_str(), ex.what() );
502     QFile::remove( FsEncoding::decode( ftsIdxName.c_str() ) );
503   }
504 }
505 
getArticleText(uint32_t articleAddress,QString & headword,QString & text)506 void MdxDictionary::getArticleText( uint32_t articleAddress, QString & headword, QString & text )
507 {
508   try
509   {
510     headword.clear();
511     string articleText;
512 
513     loadArticle( articleAddress, articleText, true );
514     text = Html::unescape( QString::fromUtf8( articleText.data(), articleText.size() ) );
515   }
516   catch( std::exception &ex )
517   {
518     gdWarning( "MDict: Failed retrieving article from \"%s\", reason: %s\n", getName().c_str(), ex.what() );
519   }
520 }
521 
getSearchResults(QString const & searchString,int searchMode,bool matchCase,int distanceBetweenWords,int maxResults,bool ignoreWordsOrder,bool ignoreDiacritics)522 sptr< Dictionary::DataRequest > MdxDictionary::getSearchResults( QString const & searchString,
523                                                                  int searchMode, bool matchCase,
524                                                                  int distanceBetweenWords,
525                                                                  int maxResults,
526                                                                  bool ignoreWordsOrder,
527                                                                  bool ignoreDiacritics )
528 {
529   return new FtsHelpers::FTSResultsRequest( *this, searchString,searchMode, matchCase, distanceBetweenWords, maxResults, ignoreWordsOrder, ignoreDiacritics );
530 }
531 
532 /// MdxDictionary::getArticle
533 
534 class MdxArticleRequest;
535 
536 class MdxArticleRequestRunnable: public QRunnable
537 {
538   MdxArticleRequest & r;
539   QSemaphore & hasExited;
540 
541 public:
542 
MdxArticleRequestRunnable(MdxArticleRequest & r_,QSemaphore & hasExited_)543   MdxArticleRequestRunnable( MdxArticleRequest & r_,
544                              QSemaphore & hasExited_ ):
545     r( r_ ),
546     hasExited( hasExited_ )
547   {}
548 
~MdxArticleRequestRunnable()549   ~MdxArticleRequestRunnable()
550   {
551     hasExited.release();
552   }
553 
554   virtual void run();
555 };
556 
557 class MdxArticleRequest: public Dictionary::DataRequest
558 {
559   friend class MdxArticleRequestRunnable;
560 
561   wstring word;
562   vector< wstring > alts;
563   MdxDictionary & dict;
564   bool ignoreDiacritics;
565 
566   QAtomicInt isCancelled;
567   QSemaphore hasExited;
568 
569 public:
570 
MdxArticleRequest(wstring const & word_,vector<wstring> const & alts_,MdxDictionary & dict_,bool ignoreDiacritics_)571   MdxArticleRequest( wstring const & word_,
572                      vector< wstring > const & alts_,
573                      MdxDictionary & dict_,
574                      bool ignoreDiacritics_ ):
575     word( word_ ),
576     alts( alts_ ),
577     dict( dict_ ),
578     ignoreDiacritics( ignoreDiacritics_ )
579   {
580     QThreadPool::globalInstance()->start( new MdxArticleRequestRunnable( *this, hasExited ) );
581   }
582 
583   void run();
584 
cancel()585   virtual void cancel()
586   {
587     isCancelled.ref();
588   }
589 
~MdxArticleRequest()590   ~MdxArticleRequest()
591   {
592     isCancelled.ref();
593     hasExited.acquire();
594   }
595 };
596 
run()597 void MdxArticleRequestRunnable::run()
598 {
599   r.run();
600 }
601 
run()602 void MdxArticleRequest::run()
603 {
604   if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
605   {
606     finish();
607     return;
608   }
609 
610   if ( dict.ensureInitDone().size() )
611   {
612     setErrorString( QString::fromUtf8( dict.ensureInitDone().c_str() ) );
613     finish();
614     return;
615   }
616 
617   vector< WordArticleLink > chain = dict.findArticles( word, ignoreDiacritics );
618 
619   for ( unsigned x = 0; x < alts.size(); ++x )
620   {
621     /// Make an additional query for each alt
622     vector< WordArticleLink > altChain = dict.findArticles( alts[ x ], ignoreDiacritics );
623     chain.insert( chain.end(), altChain.begin(), altChain.end() );
624   }
625 
626   // Some synonims make it that the articles appear several times. We combat this
627   // by only allowing them to appear once.
628   set< uint32_t > articlesIncluded;
629   // Sometimes the articles are physically duplicated. We store hashes of
630   // the bodies to account for this.
631   set< QByteArray > articleBodiesIncluded;
632   string articleText;
633 
634   for ( unsigned x = 0; x < chain.size(); ++x )
635   {
636     if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
637     {
638       finish();
639       return;
640     }
641 
642     if ( articlesIncluded.find( chain[ x ].articleOffset ) != articlesIncluded.end() )
643       continue; // We already have this article in the body.
644 
645     // Grab that article
646     string articleBody;
647     bool hasError = false;
648     QString errorMessage;
649 
650     try
651     {
652       dict.loadArticle( chain[ x ].articleOffset, articleBody );
653     }
654     catch ( exCorruptDictionary & )
655     {
656       errorMessage = tr( "Dictionary file was tampered or corrupted" );
657       hasError = true;
658     }
659     catch ( std::exception & e )
660     {
661       errorMessage = e.what();
662       hasError = true;
663     }
664 
665     if ( hasError )
666     {
667       setErrorString( tr( "Failed loading article from %1, reason: %2" )
668                       .arg( QString::fromUtf8( dict.getDictionaryFilenames()[ 0 ].c_str() ) )
669                       .arg( errorMessage ) );
670       finish();
671       return;
672     }
673 
674     if ( articlesIncluded.find( chain[ x ].articleOffset ) != articlesIncluded.end() )
675       continue; // We already have this article in the body.
676 
677     QCryptographicHash hash( QCryptographicHash::Md5 );
678     hash.addData( articleBody.data(), articleBody.size() );
679     if ( !articleBodiesIncluded.insert( hash.result() ).second )
680       continue; // Already had this body
681 
682     // Handle internal redirects
683     if ( strncmp( articleBody.c_str(), "@@@LINK=", 8 ) == 0 )
684     {
685       wstring target = Utf8::decode( articleBody.c_str() + 8 );
686       target = Folding::trimWhitespace( target );
687       // Make an additional query for this redirection
688       vector< WordArticleLink > altChain = dict.findArticles( target );
689       chain.insert( chain.end(), altChain.begin(), altChain.end() );
690       continue;
691     }
692 
693     // See Issue #271: A mechanism to clean-up invalid HTML cards.
694     string cleaner = "</font>""</font>""</font>""</font>""</font>""</font>"
695                      "</font>""</font>""</font>""</font>""</font>""</font>"
696                      "</b></b></b></b></b></b></b></b>"
697                      "</i></i></i></i></i></i></i></i>"
698                      "</a></a></a></a></a></a></a></a>";
699     articleText += "<div class=\"mdict\">" + articleBody + cleaner + "</div>\n";
700   }
701 
702   if ( !articleText.empty() )
703   {
704     Mutex::Lock _( dataMutex );
705     data.insert( data.end(), articleText.begin(), articleText.end() );
706     hasAnyData = true;
707   }
708 
709   finish();
710 }
711 
getArticle(const wstring & word,const vector<wstring> & alts,const wstring &,bool ignoreDiacritics)712 sptr<Dictionary::DataRequest> MdxDictionary::getArticle( const wstring & word, const vector<wstring> & alts,
713                                                          const wstring &, bool ignoreDiacritics ) THROW_SPEC( std::exception )
714 {
715   return new MdxArticleRequest( word, alts, *this, ignoreDiacritics );
716 }
717 
718 /// MdxDictionary::getResource
719 
720 class MddResourceRequest;
721 
722 class MddResourceRequestRunnable: public QRunnable
723 {
724   MddResourceRequest & r;
725   QSemaphore & hasExited;
726 
727 public:
728 
MddResourceRequestRunnable(MddResourceRequest & r_,QSemaphore & hasExited_)729   MddResourceRequestRunnable( MddResourceRequest & r_,
730                               QSemaphore & hasExited_ ): r( r_ ),
731     hasExited( hasExited_ )
732   {}
733 
~MddResourceRequestRunnable()734   ~MddResourceRequestRunnable()
735   {
736     hasExited.release();
737   }
738 
739   virtual void run();
740 };
741 
742 class MddResourceRequest: public Dictionary::DataRequest
743 {
744   friend class MddResourceRequestRunnable;
745 
746   MdxDictionary & dict;
747   wstring resourceName;
748   QAtomicInt isCancelled;
749   QSemaphore hasExited;
750 
751 public:
752 
MddResourceRequest(MdxDictionary & dict_,string const & resourceName_)753   MddResourceRequest( MdxDictionary & dict_,
754                       string const & resourceName_ ):
755     dict( dict_ ),
756     resourceName( Utf8::decode( resourceName_ ) )
757   {
758     QThreadPool::globalInstance()->start( new MddResourceRequestRunnable( *this, hasExited ) );
759   }
760 
761   void run(); // Run from another thread by MddResourceRequestRunnable
762 
cancel()763   virtual void cancel()
764   {
765     isCancelled.ref();
766   }
767 
~MddResourceRequest()768   ~MddResourceRequest()
769   {
770     isCancelled.ref();
771     hasExited.acquire();
772   }
773 };
774 
run()775 void MddResourceRequestRunnable::run()
776 {
777   r.run();
778 }
779 
run()780 void MddResourceRequest::run()
781 {
782   if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
783   {
784     finish();
785     return;
786   }
787 
788   if ( dict.ensureInitDone().size() )
789   {
790     setErrorString( QString::fromUtf8( dict.ensureInitDone().c_str() ) );
791     finish();
792     return;
793   }
794 
795   // In order to prevent recursive internal redirection...
796   set< QByteArray > resourceIncluded;
797 
798   for ( ;; )
799   {
800     // Some runnables linger enough that they are cancelled before they start
801     if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
802     {
803       finish();
804       return;
805     }
806 
807     string u8ResourceName = Utf8::encode( resourceName );
808     QCryptographicHash hash( QCryptographicHash::Md5 );
809     hash.addData( u8ResourceName.data(), u8ResourceName.size() );
810     if ( !resourceIncluded.insert( hash.result() ).second )
811       continue;
812 
813     // Convert to the Windows separator
814     std::replace( resourceName.begin(), resourceName.end(), '/', '\\' );
815     if ( resourceName[ 0 ] != '\\' )
816     {
817       resourceName.insert( 0, 1, '\\' );
818     }
819 
820     Mutex::Lock _( dataMutex );
821     data.clear();
822 
823     try
824     {
825       // local file takes precedence
826       string fn = FsEncoding::dirname( dict.getDictionaryFilenames()[ 0 ] ) +
827                   FsEncoding::separator() + u8ResourceName;
828       File::loadFromFile( fn, data );
829     }
830     catch ( File::exCantOpen & )
831     {
832       for ( vector< sptr< IndexedMdd > >::const_iterator i = dict.mddResources.begin();
833             i != dict.mddResources.end(); i++  )
834       {
835         sptr< IndexedMdd > mddResource = *i;
836         if ( mddResource->loadFile( resourceName, data ) )
837           break;
838       }
839     }
840 
841     // Check if this file has a redirection
842     // Always encoded in UTF16-LE
843     // L"@@@LINK="
844     static const char pattern[16] =
845     {
846       '@', '\0', '@', '\0', '@', '\0', 'L', '\0', 'I', '\0', 'N', '\0', 'K', '\0', '=', '\0'
847     };
848 
849     if ( data.size() > sizeof( pattern ) )
850     {
851       if ( memcmp( &data.front(),  pattern, sizeof( pattern ) ) == 0 )
852       {
853         data.push_back( '\0' );
854         data.push_back( '\0' );
855         QString target = MdictParser::toUtf16( "UTF-16LE", &data.front() + sizeof( pattern ),
856                                                data.size() - sizeof( pattern ) );
857         resourceName = gd::toWString( target.trimmed() );
858         continue;
859       }
860     }
861 
862     if ( data.size() > 0 )
863     {
864       hasAnyData = true;
865 
866       if ( Filetype::isNameOfCSS( u8ResourceName ) )
867       {
868         QString css = QString::fromUtf8( data.data(), data.size() );
869 
870 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
871         QRegularExpression links( "url\\(\\s*(['\"]?)([^'\"]*)(['\"]?)\\s*\\)",
872                                   QRegularExpression::CaseInsensitiveOption );
873 #else
874         QRegExp links( "url\\(\\s*(['\"]?)([^'\"]*)(['\"]?)\\s*\\)", Qt::CaseInsensitive, QRegExp::RegExp );
875 #endif
876         QString id = QString::fromUtf8( dict.getId().c_str() );
877         int pos = 0;
878 
879 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
880         QString newCSS;
881         QRegularExpressionMatchIterator it = links.globalMatch( css );
882         while ( it.hasNext() )
883         {
884           QRegularExpressionMatch match = it.next();
885           newCSS += css.midRef( pos, match.capturedStart() - pos );
886           pos = match.capturedEnd();
887           QString url = match.captured( 2 );
888 #else
889         for( ; ; )
890         {
891           pos = links.indexIn( css, pos );
892           if( pos < 0 )
893             break;
894           QString url = links.cap( 2 );
895 #endif
896 
897           if( url.indexOf( ":/" ) >= 0 || url.indexOf( "data:" ) >= 0)
898           {
899             // External link or base64-encoded data
900 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
901             newCSS += match.captured();
902 #else
903             pos += links.cap().size();
904 #endif
905             continue;
906           }
907 
908 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
909           QString newUrl = QString( "url(" ) + match.captured( 1 ) + "bres://"
910                                              + id + "/" + url + match.captured( 3 ) + ")";
911           newCSS += newUrl;
912 #else
913           QString newUrl = QString( "url(" ) + links.cap( 1 ) + "bres://"
914                                              + id + "/" + url + links.cap( 3 ) + ")";
915           css.replace( pos, links.cap().size(), newUrl );
916           pos += newUrl.size();
917 #endif
918         }
919 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
920         if( pos )
921         {
922           newCSS += css.midRef( pos );
923           css = newCSS;
924           newCSS.clear();
925         }
926 #endif
927         dict.isolateCSS( css, ".mdict" );
928         QByteArray bytes = css.toUtf8();
929         data.resize( bytes.size() );
930         memcpy( &data.front(), bytes.constData(), bytes.size() );
931       }
932     }
933     break;
934   }
935 
936   finish();
937 }
938 
939 sptr<Dictionary::DataRequest> MdxDictionary::getResource( const string & name ) THROW_SPEC( std::exception )
940 {
941   return new MddResourceRequest( *this, name );
942 }
943 
944 const QString & MdxDictionary::getDescription()
945 {
946   if ( !dictionaryDescription.isEmpty() )
947     return dictionaryDescription;
948 
949   if ( idxHeader.descriptionSize == 0 )
950   {
951     dictionaryDescription = "NONE";
952   }
953   else
954   {
955     Mutex::Lock _( idxMutex );
956     vector< char > chunk;
957     char * dictDescription = chunks.getBlock( idxHeader.descriptionAddress, chunk );
958     string str( dictDescription );
959     dictionaryDescription = QString::fromUtf8( str.c_str(), str.size() );
960   }
961 
962   return dictionaryDescription;
963 }
964 
965 void MdxDictionary::loadIcon() throw()
966 {
967   if ( dictionaryIconLoaded )
968     return;
969 
970   QString fileName =
971     QDir::fromNativeSeparators( FsEncoding::decode( getDictionaryFilenames()[ 0 ].c_str() ) );
972 
973   // Remove the extension
974   fileName.chop( 3 );
975 
976   if ( !loadIconFromFile( fileName ) )
977   {
978     // Use default icons
979     dictionaryIcon = dictionaryNativeIcon = QIcon( ":/icons/mdict.png" );
980   }
981 
982   dictionaryIconLoaded = true;
983 }
984 
985 void MdxDictionary::loadArticle( uint32_t offset, string & articleText, bool noFilter )
986 {
987   vector< char > chunk;
988   Mutex::Lock _( idxMutex );
989 
990   // Load record info from index
991   MdictParser::RecordInfo recordInfo;
992   char * pRecordInfo = chunks.getBlock( offset, chunk );
993   memcpy( &recordInfo, pRecordInfo, sizeof( recordInfo ) );
994 
995   // Make a sub unique id for this article
996   QString articleId;
997   articleId.setNum( ( quint64 )pRecordInfo, 16 );
998 
999   ScopedMemMap compressed( dictFile, recordInfo.compressedBlockPos, recordInfo.compressedBlockSize );
1000   if ( !compressed.startAddress() )
1001     throw exCorruptDictionary();
1002 
1003   QByteArray decompressed;
1004   if ( !MdictParser::parseCompressedBlock( recordInfo.compressedBlockSize, ( char * )compressed.startAddress(),
1005                                            recordInfo.decompressedBlockSize, decompressed ) )
1006     throw exCorruptDictionary();
1007 
1008   QString article = MdictParser::toUtf16( encoding.c_str(),
1009                                           decompressed.constData() + recordInfo.recordOffset,
1010                                           recordInfo.recordSize );
1011 
1012   article = MdictParser::substituteStylesheet( article, styleSheets );
1013 
1014   if( !noFilter )
1015     article = filterResource( articleId, article );
1016 
1017   // Check for unclosed <span> and <div>
1018 
1019   int openTags = article.count( QRegExp( "<\\s*span\\b", Qt::CaseInsensitive ) );
1020   int closedTags = article.count( QRegExp( "<\\s*/span\\s*>", Qt::CaseInsensitive ) );
1021   while( openTags > closedTags )
1022   {
1023     article += "</span>";
1024     closedTags += 1;
1025   }
1026 
1027   openTags = article.count( QRegExp( "<\\s*div\\b", Qt::CaseInsensitive ) );
1028   closedTags = article.count( QRegExp( "<\\s*/div\\s*>", Qt::CaseInsensitive ) );
1029   while( openTags > closedTags )
1030   {
1031     article += "</div>";
1032     closedTags += 1;
1033   }
1034 
1035   articleText = string( article.toUtf8().constData() );
1036 }
1037 
1038 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
1039 QString & MdxDictionary::filterResource( QString const & articleId, QString & article )
1040 {
1041   QString id = QString::fromStdString( getId() );
1042   QString uniquePrefix = QString::fromLatin1( "g" ) + id + "_" + articleId + "_";
1043 
1044   QRegularExpression allLinksRe( "(?:<\\s*(a(?:rea)?|img|link|script)(?:\\s+[^>]+|\\s*)>)",
1045                                  QRegularExpression::CaseInsensitiveOption );
1046   QRegularExpression wordCrossLink( "([\\s\"']href\\s*=)\\s*([\"'])entry://([^>#]*?)((?:#[^>]*?)?)\\2",
1047                                     QRegularExpression::CaseInsensitiveOption );
1048   QRegularExpression anchorIdRe( "([\\s\"'](?:name|id)\\s*=)\\s*([\"'])\\s*(?=\\S)",
1049                                  QRegularExpression::CaseInsensitiveOption );
1050   QRegularExpression anchorIdRe2( "([\\s\"'](?:name|id)\\s*=)\\s*(?=[^\"'])([^\\s\">]+)",
1051                                   QRegularExpression::CaseInsensitiveOption );
1052   QRegularExpression anchorLinkRe( "([\\s\"']href\\s*=\\s*[\"'])entry://#",
1053                                    QRegularExpression::CaseInsensitiveOption );
1054   QRegularExpression audioRe( "([\\s\"']href\\s*=)\\s*([\"'])sound://([^\">]+)\\2",
1055                               QRegularExpression::CaseInsensitiveOption
1056                               | QRegularExpression::InvertedGreedinessOption );
1057   QRegularExpression stylesRe( "([\\s\"']href\\s*=)\\s*([\"'])(?!\\s*\\b(?:(?:bres|https?|ftp)://|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\">]+)\\2",
1058                                QRegularExpression::CaseInsensitiveOption );
1059   QRegularExpression stylesRe2( "([\\s\"']href\\s*=)\\s*(?![\\s\"']|\\b(?:(?:bres|https?|ftp)://|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\\s\">]+)",
1060                                 QRegularExpression::CaseInsensitiveOption );
1061   QRegularExpression inlineScriptRe( "<\\s*script(?:(?=\\s)(?:(?![\\s\"']src\\s*=)[^>])+|\\s*)>",
1062                                      QRegularExpression::CaseInsensitiveOption );
1063   QRegularExpression closeScriptTagRe( "<\\s*/script\\s*>",
1064                                        QRegularExpression::CaseInsensitiveOption );
1065   QRegularExpression srcRe( "([\\s\"']src\\s*=)\\s*([\"'])(?!\\s*\\b(?:(?:bres|https?|ftp)://|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\">]+)\\2",
1066                             QRegularExpression::CaseInsensitiveOption );
1067   QRegularExpression srcRe2( "([\\s\"']src\\s*=)\\s*(?![\\s\"']|\\b(?:(?:bres|https?|ftp)://|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\\s\">]+)",
1068                              QRegularExpression::CaseInsensitiveOption );
1069 
1070   QString articleNewText;
1071   int linkPos = 0;
1072   QRegularExpressionMatchIterator it = allLinksRe.globalMatch( article );
1073   while( it.hasNext() )
1074   {
1075     QRegularExpressionMatch allLinksMatch = it.next();
1076 
1077     if( allLinksMatch.capturedEnd() < linkPos )
1078       continue;
1079 
1080     articleNewText += article.midRef( linkPos, allLinksMatch.capturedStart() - linkPos );
1081     linkPos = allLinksMatch.capturedEnd();
1082 
1083     QString linkTxt = allLinksMatch.captured();
1084     QString linkType = allLinksMatch.captured( 1 ).toLower();
1085     QString newLink;
1086 
1087     if( !linkType.isEmpty() && linkType.at( 0 ) == 'a' )
1088     {
1089       QRegularExpressionMatch match = anchorIdRe.match( linkTxt );
1090       if( match.hasMatch() )
1091       {
1092         QString newText = match.captured( 1 ) + match.captured( 2 ) + uniquePrefix;
1093         newLink = linkTxt.replace( match.capturedStart(), match.capturedLength(), newText );
1094       }
1095       else
1096         newLink = linkTxt.replace( anchorIdRe2, "\\1\"" + uniquePrefix + "\\2\"" );
1097 
1098       newLink = newLink.replace( anchorLinkRe, "\\1#" + uniquePrefix );
1099 
1100       match = audioRe.match( newLink );
1101       if( match.hasMatch() )
1102       {
1103         // sounds and audio link script
1104         QString newTxt = match.captured( 1 ) + match.captured( 2 )
1105                          + "gdau://" + id + "/"
1106                          + match.captured( 3 ) + match.captured( 2 );
1107         newLink = QString::fromUtf8( addAudioLink( "\"gdau://" + getId() + "/" + match.captured( 3 ).toUtf8().data() + "\"", getId() ).c_str() )
1108                   + newLink.replace( match.capturedStart(), match.capturedLength(), newTxt );
1109       }
1110 
1111       match = wordCrossLink.match( newLink );
1112       if( match.hasMatch() )
1113       {
1114         QString newTxt = match.captured( 1 ) + match.captured( 2 )
1115                          + "gdlookup://localhost/"
1116                          + match.captured( 3 );
1117 
1118         if( match.lastCapturedIndex() >= 4 && !match.captured( 4 ).isEmpty() )
1119           newTxt += QString( "?gdanchor=" ) + uniquePrefix + match.captured( 4 ).mid( 1 );
1120 
1121         newTxt += match.captured( 2 );
1122         newLink.replace( match.capturedStart(), match.capturedLength(), newTxt );
1123       }
1124     }
1125     else
1126     if( linkType.compare( "link" ) == 0 )
1127     {
1128       // stylesheets
1129       QRegularExpressionMatch match = stylesRe.match( linkTxt );
1130       if( match.hasMatch() )
1131       {
1132         QString newText = match.captured( 1 ) + match.captured( 2 )
1133                           + "bres://" + id + "/"
1134                           + match.captured( 3 ) + match.captured( 2 );
1135         newLink = linkTxt.replace( match.capturedStart(), match.capturedLength(), newText );
1136       }
1137       else
1138         newLink = linkTxt.replace( stylesRe2,
1139                                    "\\1\"bres://" + id + "/\\2\"" );
1140     }
1141     else
1142     if( linkType.compare( "script" ) == 0 || linkType.compare( "img" ) == 0 )
1143     {
1144       // javascripts and images
1145       QRegularExpressionMatch match = inlineScriptRe.match( linkTxt );
1146       if( linkType.at( 0 ) == 's'
1147           && match.hasMatch() && match.capturedLength() == linkTxt.length() )
1148       {
1149         // skip inline scripts
1150         articleNewText += linkTxt;
1151         match = closeScriptTagRe.match( article, linkPos );
1152         if( match.hasMatch() )
1153         {
1154           articleNewText += article.midRef( linkPos, match.capturedEnd() - linkPos );
1155           linkPos = match.capturedEnd();
1156         }
1157         continue;
1158       }
1159       else
1160       {
1161         match = srcRe.match( linkTxt );
1162         if( match.hasMatch() )
1163         {
1164           QString newText = match.captured( 1 ) + match.captured( 2 )
1165                             + "bres://" + id + "/"
1166                             + match.captured( 3 ) + match.captured( 2 );
1167           newLink = linkTxt.replace( match.capturedStart(), match.capturedLength(), newText );
1168         }
1169         else
1170           newLink = linkTxt.replace( srcRe2,
1171                                      "\\1\"bres://" + id + "/\\2\"" );
1172       }
1173     }
1174     if( !newLink.isEmpty() )
1175     {
1176       articleNewText += newLink;
1177     }
1178     else
1179       articleNewText += allLinksMatch.captured();
1180   }
1181   if( linkPos )
1182   {
1183     articleNewText += article.midRef( linkPos );
1184     article = articleNewText;
1185   }
1186 
1187   return article;
1188 }
1189 #else
// Rewrites the tags of an article so that its links work inside the viewer
// (QRegExp implementation, used when building against Qt 4).
//
// Only <a>, <area>, <img>, <link> and <script> opening tags are touched:
//  - <a>/<area>: name/id values get the prefix "g<dictId>_<articleId>_",
//    "entry://#anchor" links become "#<prefix>anchor", "sound://" links become
//    "gdau://<dictId>/..." (preceded by the audio link script from
//    addAudioLink()), and "entry://word[#anchor]" links become
//    "gdlookup://localhost/word[?gdanchor=<prefix>anchor]";
//  - <link>: href values that are not bres/http(s)/ftp/data/javascript URLs
//    are redirected to "bres://<dictId>/...";
//  - <img> and <script src=...>: the same redirection is applied to src;
//  - inline scripts (no src attribute) are skipped up to and including their
//    closing </script> tag.
//
// articleId : per-article id used to build the anchor prefix.
// article   : text to filter; tags are replaced in place.
// Returns a reference to `article`.
1190 QString & MdxDictionary::filterResource( QString const & articleId, QString & article )
1191 {
1192   QString id = QString::fromStdString( getId() );
1193   QString uniquePrefix = QString::fromLatin1( "g" ) + id + "_" + articleId + "_";
1194 
1195   QRegExp allLinksRe( "(?:<\\s*(a(?:rea)?|img|link|script)(?:\\s+[^>]+|\\s*)>)", Qt::CaseInsensitive );
1196   QRegExp wordCrossLink( "([\\s\"']href\\s*=)\\s*([\"'])entry://([^>#]*)((?:#[^>]*)?)\\2", Qt::CaseInsensitive );
1197   wordCrossLink.setMinimal( true );
1198 
1199   QRegExp anchorIdRe( "([\\s\"'](?:name|id)\\s*=)\\s*([\"'])\\s*(?=\\S)", Qt::CaseInsensitive );
1200   QRegExp anchorIdRe2( "([\\s\"'](?:name|id)\\s*=)\\s*(?=[^\"'])([^\\s\">]+)", Qt::CaseInsensitive );
1201   QRegExp anchorLinkRe( "([\\s\"']href\\s*=\\s*[\"'])entry://#", Qt::CaseInsensitive );
1202   QRegExp audioRe( "([\\s\"']href\\s*=)\\s*([\"'])sound://([^\">]+)\\2", Qt::CaseInsensitive );
1203   audioRe.setMinimal( true );
1204 
1205   QRegExp stylesRe( "([\\s\"']href\\s*=)\\s*([\"'])(?!\\s*\\b(?:(?:bres|https?|ftp)://|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\">]+)\\2",
1206                     Qt::CaseInsensitive, QRegExp::RegExp2 );
1207   stylesRe.setMinimal( true );
1208   QRegExp stylesRe2( "([\\s\"']href\\s*=)\\s*(?![\\s\"']|\\b(?:(?:bres|https?|ftp)://|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\\s\">]+)",
1209                      Qt::CaseInsensitive, QRegExp::RegExp2 );
1210   QRegExp inlineScriptRe( "<\\s*script(?:(?=\\s)(?:(?![\\s\"']src\\s*=)[^>])+|\\s*)>", Qt::CaseInsensitive, QRegExp::RegExp2 );
1211   QRegExp closeScriptTagRe( "<\\s*/script\\s*>", Qt::CaseInsensitive, QRegExp::RegExp2 );
1212   QRegExp srcRe( "([\\s\"']src\\s*=)\\s*([\"'])(?!\\s*\\b(?:(?:bres|https?|ftp)://|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\">]+)\\2",
1213                      Qt::CaseInsensitive, QRegExp::RegExp2 );
1214   srcRe.setMinimal( true );
1215   QRegExp srcRe2( "([\\s\"']src\\s*=)\\s*(?![\\s\"']|\\b(?:(?:bres|https?|ftp)://|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\\s\">]+)",
1216                     Qt::CaseInsensitive, QRegExp::RegExp2 );
1217 
       // linkPos is the position in `article` from which the next tag is
       // searched; it is moved past every tag that has been handled.
1218   int linkPos = 0;
1219   while( linkPos >= 0 )
1220   {
1221     linkPos = allLinksRe.indexIn( article, linkPos );
1222     if( linkPos < 0 )
1223       break;
1224 
1225     QString linkTxt = allLinksRe.cap( 0 );
1226     QString linkType = allLinksRe.cap( 1 ).toLower();
         // newLink stays empty when the tag needs no replacement
1227     QString newLink;
1228 
1229     if( !linkType.isEmpty() && linkType.at( 0 ) == 'a' )
1230     {
           // <a> and <area>: make anchor names unique for this article
1231       int pos = anchorIdRe.indexIn( linkTxt );
1232       if( pos >= 0 )
1233       {
1234         QString newText = anchorIdRe.cap( 1 ) + anchorIdRe.cap( 2 ) + uniquePrefix;
1235         newLink = linkTxt.replace( pos, anchorIdRe.cap().length(), newText );
1236       }
1237       else
1238         newLink = linkTxt.replace( anchorIdRe2, "\\1\"" + uniquePrefix + "\\2\"" );
1239 
           // Links to anchors of the same article
1240       newLink = newLink.replace( anchorLinkRe, "\\1#" + uniquePrefix );
1241 
1242       pos = audioRe.indexIn( newLink );
1243       if( pos >= 0 )
1244       {
1245         // sounds and audio link script
1246         QString newTxt = audioRe.cap( 1 ) + audioRe.cap( 2 )
1247                          + "gdau://" + id + "/"
1248                          + audioRe.cap( 3 ) + audioRe.cap( 2 );
1249         newLink = QString::fromUtf8( addAudioLink( "\"gdau://" + getId() + "/" + audioRe.cap( 3 ).toUtf8().data() + "\"", getId() ).c_str() )
1250                   + newLink.replace( pos, audioRe.cap().length(), newTxt );
1251       }
1252 
           // Links to other entries, optionally with an anchor
1253       pos = wordCrossLink.indexIn( newLink );
1254       if( pos >= 0 )
1255       {
1256         QString newTxt = wordCrossLink.cap( 1 ) + wordCrossLink.cap( 2 )
1257                          + "gdlookup://localhost/"
1258                          + wordCrossLink.cap( 3 );
1259 
             // cap( 4 ) is "#anchor"; mid( 1 ) drops the '#'
1260         if( !wordCrossLink.cap( 4 ).isEmpty() )
1261           newTxt += QString( "?gdanchor=" ) + uniquePrefix + wordCrossLink.cap( 4 ).mid( 1 );
1262 
1263         newTxt += wordCrossLink.cap( 2 );
1264         newLink.replace( pos, wordCrossLink.cap( 0 ).length(), newTxt );
1265       }
1266     }
1267     else
1268     if( linkType.compare( "link" ) == 0 )
1269     {
1270       // stylesheets
1271       int pos = stylesRe.indexIn( linkTxt );
1272       if( pos >= 0 )
1273       {
1274         QString newText = stylesRe.cap( 1 ) + stylesRe.cap( 2 )
1275                           + "bres://" + id + "/"
1276                           + stylesRe.cap( 3 ) + stylesRe.cap( 2 );
1277         newLink = linkTxt.replace( pos, stylesRe.cap().length(), newText );
1278       }
1279       else
             // unquoted href value
1280         newLink = linkTxt.replace( stylesRe2,
1281                                    "\\1\"bres://" + id + "/\\2\"" );
1282     }
1283     else
1284     if( linkType.compare( "script" ) == 0 || linkType.compare( "img" ) == 0 )
1285     {
1286       // javascripts and images
1287       if( linkType.at( 0 ) == 's' && inlineScriptRe.exactMatch( linkTxt ) )
1288       {
1289         // skip inline scripts
             // The search position jumps behind the closing tag when one is
             // found, otherwise just behind the opening tag.
1290         linkPos += linkTxt.length();
1291         int pos = closeScriptTagRe.indexIn( article, linkPos );
1292         if( pos > 0 )
1293           linkPos = pos + closeScriptTagRe.cap().length();
1294         continue;
1295       }
1296       else
1297       {
1298         int pos = srcRe.indexIn( linkTxt );
1299         if( pos >= 0 )
1300         {
1301           QString newText = srcRe.cap( 1 ) + srcRe.cap( 2 )
1302                             + "bres://" + id + "/"
1303                             + srcRe.cap( 3 ) + srcRe.cap( 2 );
1304           newLink = linkTxt.replace( pos, srcRe.cap().length(), newText );
1305         }
1306         else
               // unquoted src value
1307           newLink = linkTxt.replace( srcRe2,
1308                                      "\\1\"bres://" + id + "/\\2\"" );
1309       }
1310     }
         // Replace the tag inside the article and continue searching right
         // behind the text that was just written (or behind the original tag)
1311     if( !newLink.isEmpty() )
1312     {
1313       article.replace( linkPos, allLinksRe.cap().length(), newLink );
1314       linkPos += newLink.length();
1315     }
1316     else
1317       linkPos += allLinksRe.cap().length();
1318   }
1319 
1320   return article;
1321 }
1322 #endif
1323 
1324 static void addEntryToIndex( QString const & word, uint32_t offset, IndexedWords & indexedWords )
1325 {
1326   // Strip any leading or trailing whitespaces
1327   QString wordTrimmed = word.trimmed();
1328   indexedWords.addWord( gd::toWString( wordTrimmed ), offset );
1329 }
1330 
1331 static void addEntryToIndexSingle( QString const & word, uint32_t offset, IndexedWords & indexedWords )
1332 {
1333   // Strip any leading or trailing whitespaces
1334   QString wordTrimmed = word.trimmed();
1335   indexedWords.addSingleWord( gd::toWString( wordTrimmed ), offset );
1336 }
1337 
1338 class ArticleHandler: public MdictParser::RecordHandler
1339 {
1340 public:
1341   ArticleHandler( ChunkedStorage::Writer & chunks, IndexedWords & indexedWords ) :
1342     chunks( chunks ),
1343     indexedWords( indexedWords )
1344   {
1345   }
1346 
1347   virtual void handleRecord( QString const & headWord, MdictParser::RecordInfo const & recordInfo )
1348   {
1349     // Save the article's record info
1350     uint32_t articleAddress = chunks.startNewBlock();
1351     chunks.addToBlock( &recordInfo, sizeof( recordInfo ) );
1352     // Add entries to the index
1353     addEntryToIndex( headWord, articleAddress, indexedWords );
1354   }
1355 
1356 private:
1357   ChunkedStorage::Writer & chunks;
1358   IndexedWords & indexedWords;
1359 };
1360 
1361 class ResourceHandler: public MdictParser::RecordHandler
1362 {
1363 public:
1364   ResourceHandler( ChunkedStorage::Writer & chunks, IndexedWords & indexedWords ):
1365     chunks( chunks ),
1366     indexedWords( indexedWords )
1367   {
1368   }
1369 
1370   virtual void handleRecord( QString const & fileName, MdictParser::RecordInfo const & recordInfo )
1371   {
1372     uint32_t resourceInfoAddress = chunks.startNewBlock();
1373     chunks.addToBlock( &recordInfo, sizeof( recordInfo ) );
1374     // Add entries to the index
1375     addEntryToIndexSingle( fileName, resourceInfoAddress, indexedWords );
1376   }
1377 
1378 private:
1379   ChunkedStorage::Writer & chunks;
1380   IndexedWords & indexedWords;
1381 };
1382 
1383 
1384 static bool indexIsOldOrBad( vector< string > const & dictFiles, string const & indexFile )
1385 {
1386   File::Class idx( indexFile, "rb" );
1387   IdxHeader header;
1388 
1389   return idx.readRecords( &header, sizeof( header ), 1 ) != 1 ||
1390          header.signature != kSignature ||
1391          header.formatVersion != kCurrentFormatVersion ||
1392          header.parserVersion != MdictParser::kParserVersion ||
1393          header.foldingVersion != Folding::Version ||
1394          header.mddIndexInfosCount != dictFiles.size() - 1;
1395 }
1396 
1397 static void findResourceFiles( string const & mdx, vector< string > & dictFiles )
1398 {
1399   string base( mdx, 0, mdx.size() - 4 );
1400   // Check if there' is any file end with .mdd, which is the resource file for the dictionary
1401   string resFile;
1402   if ( File::tryPossibleName( base + ".mdd", resFile ) )
1403   {
1404     dictFiles.push_back( resFile );
1405     // Find complementary .mdd file (volumes), like follows:
1406     //   demo.mdx   <- main dictionary file
1407     //   demo.mdd   <- main resource file ( 1st volume )
1408     //   demo.1.mdd <- 2nd volume
1409     //   ...
1410     //   demo.n.mdd <- nth volume
1411     QString baseU8 = QString::fromUtf8( base.c_str() );
1412     int vol = 1;
1413     while ( File::tryPossibleName( string( QString( "%1.%2.mdd" ).arg( baseU8 ).arg( vol )
1414                                            .toUtf8().constBegin() ), resFile ) )
1415     {
1416       dictFiles.push_back( resFile );
1417       vol++;
1418     }
1419   }
1420 }
1421 
// Creates an MdxDictionary for every *.mdx file found in fileNames.
//
// For each such file the accompanying .mdd resource files are collected with
// findResourceFiles(), a dictionary id is derived from the resulting file
// list, and the index file "<indicesDir><dictId>" is (re)built when
// Dictionary::needToRebuildIndex() or indexIsOldOrBad() report it as missing,
// outdated or unusable.  Files that MdictParser cannot open are skipped.
//
// Layout of a freshly built index, in write order:
//   IdxHeader (dummy first, rewritten at the end), title, encoding,
//   chunked storage (description + one RecordInfo block per article and
//   resource), the headword b-tree, the stylesheets, one b-tree per .mdd file
//   and finally the table of .mdd file names with their index infos.
//
// fileNames    : candidate files; anything not ending in ".mdx"
//                (case-insensitive) is ignored.
// indicesDir   : prefix to which the dictionary id is appended to form the
//                index file name.
// initializing : notified through indexingDictionary() with the dictionary
//                title before an index is built.
// Returns the list of created dictionaries.
1422 vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & fileNames,
1423                                                       string const & indicesDir,
1424                                                       Dictionary::Initializing & initializing ) THROW_SPEC( std::exception )
1425 {
1426   vector< sptr< Dictionary::Class > > dictionaries;
1427 
1428   for ( vector< string >::const_iterator i = fileNames.begin(); i != fileNames.end(); i++ )
1429   {
1430     // Skip files with the extensions different to .mdx to speed up the
1431     // scanning
1432     if ( i->size() < 4 || strcasecmp( i->c_str() + ( i->size() - 4 ), ".mdx" ) != 0 )
1433       continue;
1434 
         // dictFiles[ 0 ] is the .mdx file itself, followed by its .mdd files
1435     vector< string > dictFiles( 1, *i );
1436     findResourceFiles( *i, dictFiles );
1437 
1438     string dictId = Dictionary::makeDictionaryId( dictFiles );
1439     string indexFile = indicesDir + dictId;
1440 
1441     if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) ||
1442          indexIsOldOrBad( dictFiles, indexFile ) )
1443     {
1444       // Building the index
1445 
1446       gdDebug( "MDict: Building the index for dictionary: %s\n", i->c_str() );
1447 
1448       MdictParser parser;
1449       list< sptr< MdictParser > > mddParsers;
1450 
           // An .mdx file that cannot be opened yields no dictionary at all
1451       if ( !parser.open( i->c_str() ) )
1452         continue;
1453 
1454       string title = string( parser.title().toUtf8().constData() );
1455       initializing.indexingDictionary( title );
1456 
           // Open a parser for every .mdd file; broken ones are only warned
           // about and left out.
           // NOTE(review): indexIsOldOrBad() compares mddIndexInfosCount with
           // dictFiles.size() - 1, so an index built with a skipped .mdd file
           // looks outdated on the next run -- confirm this is intended.
1457       for ( vector< string >::const_iterator mddIter = dictFiles.begin() + 1;
1458             mddIter != dictFiles.end(); mddIter++ )
1459       {
1460         if ( File::exists( *mddIter ) )
1461         {
1462           sptr< MdictParser > mddParser = new MdictParser();
1463           if ( !mddParser->open( mddIter->c_str() ) )
1464           {
1465             gdWarning( "Broken mdd (resource) file: %s\n", mddIter->c_str() );
1466             continue;
1467           }
1468           mddParsers.push_back( mddParser );
1469         }
1470       }
1471 
1472       File::Class idx( indexFile, "wb" );
1473       IdxHeader idxHeader;
1474       memset( &idxHeader, 0, sizeof( idxHeader ) );
1475       // We write a dummy header first. At the end of the process the header
1476       // will be rewritten with the right values.
1477       idx.write( idxHeader );
1478 
1479       // Write the title first
1480       idx.write< uint32_t >( title.size() );
1481       idx.write( title.data(), title.size() );
1482 
1483       // then the encoding
1484       {
1485         string encoding = string( parser.encoding().toUtf8().constData() );
1486         idx.write< uint32_t >( encoding.size() );
1487         idx.write( encoding.data(), encoding.size() );
1488       }
1489 
1490       // This is our index data that we accumulate during the loading process.
1491       // For each new word encountered, we emit the article's body to the file
1492       // immediately, inserting the word itself and its offset in this map.
1493       // This map maps folded words to the original words and the corresponding
1494       // articles' offsets.
1495       IndexedWords indexedWords;
1496       ChunkedStorage::Writer chunks( idx );
1497 
1498       idxHeader.isRightToLeft = parser.isRightToLeft();
1499 
1500       // Save dictionary description if there's one
           // The terminating NUL is stored too, so descriptionSize is at least 1
1501       {
1502         string description = string( parser.description().toUtf8().constData() );
1503         idxHeader.descriptionAddress = chunks.startNewBlock();
1504         chunks.addToBlock( description.c_str(), description.size() + 1 );
1505         idxHeader.descriptionSize = description.size() + 1;
1506       }
1507 
1508       ArticleHandler articleHandler( chunks, indexedWords );
1509       MdictParser::HeadWordIndex headWordIndex;
1510 
1511       // enumerating word and its definition
1512       while ( parser.readNextHeadWordIndex( headWordIndex ) )
1513       {
1514         parser.readRecordBlock( headWordIndex, articleHandler );
1515       }
1516 
1517       // enumerating resources if there's any
           // Each .mdd file gets its own IndexedWords; all of its headword
           // index blocks are concatenated before the records are read.
1518       vector< sptr< IndexedWords > > mddIndices;
1519       vector< string > mddFileNames;
1520       while ( !mddParsers.empty() )
1521       {
1522         sptr< MdictParser > mddParser = mddParsers.front();
1523         sptr< IndexedWords > mddIndexedWords = new IndexedWords();
1524         MdictParser::HeadWordIndex resourcesIndex;
1525         ResourceHandler resourceHandler( chunks, *mddIndexedWords );
1526 
1527         while ( mddParser->readNextHeadWordIndex( headWordIndex ) )
1528         {
1529           resourcesIndex.insert( resourcesIndex.end(), headWordIndex.begin(), headWordIndex.end() );
1530         }
1531         mddParser->readRecordBlock( resourcesIndex, resourceHandler );
1532 
1533         mddIndices.push_back( mddIndexedWords );
1534         // Save filename for .mdd files only
1535         QFileInfo fi( mddParser->filename() );
1536         mddFileNames.push_back( string( fi.fileName().toUtf8().constData() ) );
1537         mddParsers.pop_front();
1538       }
1539 
1540       // Finish with the chunks
1541       idxHeader.chunksOffset = chunks.finish();
1542 
1543       GD_DPRINTF( "Writing index...\n" );
1544 
1545       // Good. Now build the index
1546       IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );
1547       idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
1548       idxHeader.indexRootOffset = idxInfo.rootOffset;
1549 
1550       // Save dictionary stylesheets
           // Per stylesheet: key, then the begin and end strings, each written
           // as a length (including the terminating NUL) followed by the bytes
1551       {
1552         MdictParser::StyleSheets const & styleSheets = parser.styleSheets();
1553         idxHeader.styleSheetAddress = idx.tell();
1554         idxHeader.styleSheetCount = styleSheets.size();
1555 
1556         for ( MdictParser::StyleSheets::const_iterator iter = styleSheets.begin();
1557               iter != styleSheets.end(); iter++ )
1558         {
1559           string styleBegin( iter->second.first.toUtf8().constData() );
1560           string styleEnd( iter->second.second.toUtf8().constData() );
1561 
1562           // key
1563           idx.write<qint32>( iter->first );
1564           // styleBegin
1565           idx.write<quint32>( ( quint32 )styleBegin.size() + 1 );
1566           idx.write( styleBegin.c_str(), styleBegin.size() + 1 );
1567           // styleEnd
1568           idx.write<quint32>( ( quint32 )styleEnd.size() + 1 );
1569           idx.write( styleEnd.c_str(), styleEnd.size() + 1 );
1570         }
1571       }
1572 
1573       // read languages
1574       QPair<quint32, quint32> langs = LangCoder::findIdsForFilename( QString::fromStdString( *i ) );
1575 
1576       // if no languages found, try dictionary's name
1577       if ( langs.first == 0 || langs.second == 0 )
1578       {
1579         langs = LangCoder::findIdsForFilename( parser.title() );
1580       }
1581 
1582       idxHeader.langFrom = langs.first;
1583       idxHeader.langTo = langs.second;
1584 
1585       // Build index info for each mdd file
1586       vector< IndexInfo > mddIndexInfos;
1587       for ( vector< sptr< IndexedWords > >::const_iterator mddIndexIter = mddIndices.begin();
1588             mddIndexIter != mddIndices.end(); mddIndexIter++ )
1589       {
1590         IndexInfo resourceIdxInfo = BtreeIndexing::buildIndex( *( *mddIndexIter ), idx );
1591         mddIndexInfos.push_back( resourceIdxInfo );
1592       }
1593 
1594       // Save address of IndexInfos for resource files
           // Per .mdd file: name length (including the terminating NUL), name,
           // b-tree max elements, b-tree root offset
1595       idxHeader.mddIndexInfosOffset = idx.tell();
1596       idxHeader.mddIndexInfosCount = mddIndexInfos.size();
1597       for ( uint32_t mi = 0; mi < mddIndexInfos.size(); mi++ )
1598       {
1599         const string & mddfile = mddFileNames[ mi ];
1600 
1601         idx.write<quint32>( ( quint32 )mddfile.size() + 1 );
1602         idx.write( mddfile.c_str(), mddfile.size() + 1 );
1603         idx.write<uint32_t>( mddIndexInfos[ mi ].btreeMaxElements );
1604         idx.write<uint32_t>( mddIndexInfos[ mi ].rootOffset );
1605       }
1606 
1607       // That concludes it. Update the header.
1608       idxHeader.signature = kSignature;
1609       idxHeader.formatVersion = kCurrentFormatVersion;
1610       idxHeader.parserVersion = MdictParser::kParserVersion;
1611       idxHeader.foldingVersion = Folding::Version;
           // Both counters are taken from the parser's word count
1612       idxHeader.articleCount = parser.wordCount();
1613       idxHeader.wordCount = parser.wordCount();
1614 
1615       idx.rewind();
1616       idx.write( &idxHeader, sizeof( idxHeader ) );
1617     }
1618 
         // The dictionary is created from the index, whether it was just built
         // or already up to date
1619     dictionaries.push_back( new MdxDictionary( dictId, indexFile, dictFiles ) );
1620   }
1621 
1622   return dictionaries;
1623 }
1624 
1625 }
1626