1 /* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
2  * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
3 
4 #include "dsl.hh"
5 #include "dsl_details.hh"
6 #include "btreeidx.hh"
7 #include "folding.hh"
8 #include "utf8.hh"
9 #include "chunkedstorage.hh"
10 #include "dictzip.h"
11 #include "htmlescape.hh"
12 #include "iconv.hh"
13 #include "filetype.hh"
14 #include "fsencoding.hh"
15 #include "audiolink.hh"
16 #include "langcoder.hh"
17 #include "wstring_qt.hh"
18 #include "zipfile.hh"
19 #include "indexedzip.hh"
20 #include "gddebug.hh"
21 #include "tiff.hh"
22 #include "fulltextsearch.hh"
23 #include "ftshelpers.hh"
24 #include "language.hh"
25 
26 #include <zlib.h>
27 #include <map>
28 #include <set>
29 #include <string>
30 #include <vector>
31 #include <list>
32 #include <wctype.h>
33 
34 #ifdef _MSC_VER
35 #include <stub_msvc.h>
36 #endif
37 
38 #include <QSemaphore>
39 #include <QThreadPool>
40 #include <QAtomicInt>
41 #include <QUrl>
42 
43 #include <QDir>
44 #include <QFileInfo>
45 #include <QPainter>
46 #include <QMap>
47 #include <QStringList>
48 
49 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
50 #include <QRegularExpression>
51 #else
52 #include <QRegExp>
53 #endif
54 
55 // For TIFF conversion
56 #include <QImage>
57 #include <QByteArray>
58 #include <QBuffer>
59 
60 // For SVG handling
61 #include <QtSvg/QSvgRenderer>
62 
63 #include "qt4x5.hh"
64 
65 namespace Dsl {
66 
67 using namespace Details;
68 
69 using std::map;
70 using std::multimap;
71 using std::pair;
72 using std::set;
73 using std::string;
74 using gd::wstring;
75 using gd::wchar;
76 using std::vector;
77 using std::list;
78 
79 using BtreeIndexing::WordArticleLink;
80 using BtreeIndexing::IndexedWords;
81 using BtreeIndexing::IndexInfo;
82 
83 namespace {
84 
// Exception types used by the DSL backend. DEF_EX / DEF_EX_STR are project
// macros defined outside this file; presumably each declares an exception
// class with the given default message and Dictionary::Ex as its base, the
// _STR variant additionally accepting a detail string (see the
// exDictzipError( string ) use in doDeferredInit()) -- confirm at the macro
// definition.
DEF_EX_STR( exCantReadFile, "Can't read file", Dictionary::Ex )
DEF_EX( exUserAbort, "User abort", Dictionary::Ex )
DEF_EX_STR( exDictzipError, "DICTZIP error", Dictionary::Ex )
88 
/// Constants identifying the on-disk index format. indexIsOldOrBad() compares
/// the stored header against Signature, CurrentFormatVersion and
/// CurrentZipSupportVersion; a mismatch makes it report the index as old/bad.
enum
{
  Signature = 0x584c5344, // DSLX on little-endian, XLSD on big-endian
  // Also changes whenever the btree index or folding versions change
  CurrentFormatVersion = 23 + BtreeIndexing::FormatVersion + Folding::Version,
  // Only checked for dictionaries that have a resource zip file
  CurrentZipSupportVersion = 2,
  // Reported by DslDictionary::getFtsIndexVersion()
  CurrentFtsIndexVersion = 7
};
96 
/// Header record stored at the very beginning of the index file. It is read
/// as one raw record (see indexIsOldOrBad() and the DslDictionary
/// constructor), so the order and types of the fields are part of the file
/// format and must not be changed without bumping CurrentFormatVersion.
struct IdxHeader
{
  uint32_t signature; // First comes the signature, DSLX
  uint32_t formatVersion; // File format version (CurrentFormatVersion)
  uint32_t zipSupportVersion; // Zip support version -- narrows down reindexing
                              // when it changes only for dictionaries with the
                              // zip files
  int dslEncoding; // Which encoding is used for the file indexed
  uint32_t chunksOffset; // The offset to chunks' storage
  uint32_t hasAbrv; // Non-zero means file has abrvs at abrvAddress
  uint32_t abrvAddress; // Address of abrv map in the chunked storage
  uint32_t indexBtreeMaxElements; // Two fields from IndexInfo
  uint32_t indexRootOffset;
  uint32_t articleCount; // Number of articles this dictionary has
  uint32_t wordCount; // Number of headwords this dictionary has
  uint32_t langFrom;  // Source language
  uint32_t langTo;    // Target language
  uint32_t hasZipFile; // Non-zero means there's a zip file with resources
                       // present
  uint32_t hasSoundDictionaryName; // Non-zero enables use of the preferred
                                   // sound dictionary name in audio links
  uint32_t zipIndexBtreeMaxElements; // Two fields from IndexInfo of the zip
                                     // resource index.
  uint32_t zipIndexRootOffset;
}
// The packed attribute is only applied on non-MSVC compilers.
#ifndef _MSC_VER
__attribute__((packed))
#endif
;
125 
126 struct InsidedCard
127 {
128   uint32_t offset;
129   uint32_t size;
130   QVector< wstring > headwords;
InsidedCardDsl::__anonc7691c880111::InsidedCard131   InsidedCard( uint32_t _offset, uint32_t _size, QVector< wstring > const & words ) :
132   offset( _offset ), size( _size ), headwords( words )
133   {}
InsidedCardDsl::__anonc7691c880111::InsidedCard134   InsidedCard( InsidedCard const & e ) :
135   offset( e.offset ), size( e.size ), headwords( e.headwords )
136   {}
InsidedCardDsl::__anonc7691c880111::InsidedCard137   InsidedCard() {}
138 
139 };
140 
indexIsOldOrBad(string const & indexFile,bool hasZipFile)141 bool indexIsOldOrBad( string const & indexFile, bool hasZipFile )
142 {
143   File::Class idx( indexFile, "rb" );
144 
145   IdxHeader header;
146 
147   return idx.readRecords( &header, sizeof( header ), 1 ) != 1 ||
148          header.signature != Signature ||
149          header.formatVersion != CurrentFormatVersion ||
150          (bool) header.hasZipFile != hasZipFile ||
151          ( hasZipFile && header.zipSupportVersion != CurrentZipSupportVersion );
152 }
153 
/// A dictionary backed by a DSL (optionally dictzip-compressed) file and a
/// prebuilt index file. The constructor only reads the index header and the
/// stored names; the rest of the setup happens in doDeferredInit(), either on
/// a thread-pool thread (deferredInit()) or on demand (ensureInitDone()).
///
/// The declaration order of idx and idxHeader matters: the constructor
/// initialises idxHeader by reading from idx.
class DslDictionary: public BtreeIndexing::BtreeDictionary
{
  Mutex idxMutex;    // Held while reading blocks from the index file
  File::Class idx;   // The index file, opened for reading
  IdxHeader idxHeader; // Header record read from idx at construction
  sptr< ChunkedStorage::Reader > chunks; // Created in doDeferredInit()
  string dictionaryName;           // Read from the index file
  string preferredSoundDictionary; // Read from the index file
  map< string, string > abrv;      // Abbreviations, loaded in doDeferredInit()
  Mutex dzMutex;     // Held while reading article bodies through dz
  dictData * dz;     // Handle from dict_data_open(); 0 until opened
  Mutex resourceZipMutex; // Held while loading files from resourceZip
  IndexedZip resourceZip;
  BtreeIndex resourceZipIndex;

  QAtomicInt deferredInitDone; // Non-zero once doDeferredInit() has finished,
                               // whether it succeeded or failed
  Mutex deferredInitMutex;
  bool deferredInitRunnableStarted; // True once the init runnable was queued
  QSemaphore deferredInitRunnableExited; // Released by the runnable's
                                         // destructor

  string initError; // Message of the exception caught during deferred init;
                    // empty if none was caught

  int optionalPartNom; // Counter of optional ([*]) parts in the article being
                       // converted; reset by dslToHtml()
  quint8 articleNom;   // Used in the ids generated for optional parts
  int maxPictureWidth; // Pictures wider than this (when > 0) get a resized
                       // link in nodeToHtml()

  wstring currentHeadword; // Headword passed to the last dslToHtml() call

public:

  DslDictionary( string const & id, string const & indexFile,
                 vector< string > const & dictionaryFiles,
                 int maxPictureWidth_ );

  /// Queues the deferred initialisation on the global thread pool, unless it
  /// is already done or already queued.
  virtual void deferredInit();

  ~DslDictionary();

  virtual string getName() throw()
  { return dictionaryName; }

  virtual map< Dictionary::Property, string > getProperties() throw()
  { return map< Dictionary::Property, string >(); }

  virtual unsigned long getArticleCount() throw()
  { return idxHeader.articleCount; }

  virtual unsigned long getWordCount() throw()
  { return idxHeader.wordCount; }

  inline virtual quint32 getLangFrom() const
  { return idxHeader.langFrom; }

  inline virtual quint32 getLangTo() const
  { return idxHeader.langTo; }

  // Disabled code, kept as it was
  #if 0
  virtual vector< wstring > findHeadwordsForSynonym( wstring const & )
    THROW_SPEC( std::exception )
  {
    return vector< wstring >();
  }
  #endif

  virtual sptr< Dictionary::DataRequest > getArticle( wstring const &,
                                                      vector< wstring > const & alts,
                                                      wstring const &,
                                                      bool ignoreDiacritics )
    THROW_SPEC( std::exception );

  virtual sptr< Dictionary::DataRequest > getResource( string const & name )
    THROW_SPEC( std::exception );

  virtual sptr< Dictionary::DataRequest > getSearchResults( QString const & searchString,
                                                            int searchMode, bool matchCase,
                                                            int distanceBetweenWords,
                                                            int maxResults,
                                                            bool ignoreWordsOrder,
                                                            bool ignoreDiacritics );
  virtual QString const& getDescription();

  virtual QString getMainFilename();

  virtual void getArticleText( uint32_t articleAddress, QString & headword, QString & text );

  virtual void makeFTSIndex(QAtomicInt & isCancelled, bool firstIteration );

  /// Decides whether full-text search is available for this dictionary.
  /// Does nothing if the deferred initialisation reported an error.
  virtual void setFTSParameters( Config::FullTextSearch const & fts )
  {
    if( ensureInitDone().size() )
      return;

    // FTS must be enabled, the "DSL" type must not be disabled, and the
    // article count must be within the configured limit (0 = no limit)
    can_FTS = fts.enabled
              && !fts.disabledTypes.contains( "DSL", Qt::CaseInsensitive )
              && ( fts.maxDictionarySize == 0 || getArticleCount() <= fts.maxDictionarySize );
  }

  virtual uint32_t getFtsIndexVersion()
  { return CurrentFtsIndexVersion; }

protected:

  virtual void loadIcon() throw();

private:

  /// Runs the deferred initialisation if it hasn't completed yet and returns
  /// the stored error message (empty when no error was recorded).
  virtual string const & ensureInitDone();
  void doDeferredInit();

  /// Loads the article. Does not process the DSL language.
  void loadArticle( uint32_t address,
                    wstring const & requestedHeadwordFolded,
                    bool ignoreDiacritics,
                    wstring & tildeValue,
                    wstring & displayedHeadword,
                    unsigned & headwordIndex,
                    wstring & articleText );

  /// Converts DSL language to an Html.
  string dslToHtml( wstring const &, wstring const & headword = wstring() );

  // Parts of dslToHtml()
  string nodeToHtml( ArticleDom::Node const & );
  string processNodeChildren( ArticleDom::Node const & node );

  bool hasHiddenZones()           /// Return true if article has hidden zones
  { return optionalPartNom != 0; }

  friend class DslArticleRequest;
  friend class DslResourceRequest;
  friend class DslFTSResultsRequest;
  friend class DslDeferredInitRunnable;
};
287 
DslDictionary(string const & id,string const & indexFile,vector<string> const & dictionaryFiles,int maxPictureWidth_)288 DslDictionary::DslDictionary( string const & id,
289                               string const & indexFile,
290                               vector< string > const & dictionaryFiles,
291                               int maxPictureWidth_ ):
292   BtreeDictionary( id, dictionaryFiles ),
293   idx( indexFile, "rb" ),
294   idxHeader( idx.read< IdxHeader >() ),
295   dz( 0 ),
296   deferredInitRunnableStarted( false ),
297   optionalPartNom( 0 ),
298   articleNom( 0 ),
299   maxPictureWidth( maxPictureWidth_ )
300 {
301   can_FTS = true;
302 
303   ftsIdxName = indexFile + "_FTS";
304 
305   if( !Dictionary::needToRebuildIndex( dictionaryFiles, ftsIdxName )
306       && !FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this ) )
307     FTS_index_completed.ref();
308 
309   // Read the dictionary name
310 
311   idx.seek( sizeof( idxHeader ) );
312 
313   vector< char > dName( idx.read< uint32_t >() );
314   if( dName.size() > 0 )
315   {
316     idx.read( &dName.front(), dName.size() );
317     dictionaryName = string( &dName.front(), dName.size() );
318   }
319 
320   vector< char > sName( idx.read< uint32_t >() );
321   if( sName.size() > 0 )
322   {
323     idx.read( &sName.front(), sName.size() );
324     preferredSoundDictionary = string( &sName.front(), sName.size() );
325   }
326 
327   // Everything else would be done in deferred init
328 }
329 
~DslDictionary()330 DslDictionary::~DslDictionary()
331 {
332   Mutex::Lock _( deferredInitMutex );
333 
334   // Wait for init runnable to complete if it was ever started
335   if ( deferredInitRunnableStarted )
336     deferredInitRunnableExited.acquire();
337 
338   if ( dz )
339     dict_data_close( dz );
340 }
341 
342 //////// DslDictionary::deferredInit()
343 
344 class DslDeferredInitRunnable: public QRunnable
345 {
346   DslDictionary & dictionary;
347   QSemaphore & hasExited;
348 
349 public:
350 
DslDeferredInitRunnable(DslDictionary & dictionary_,QSemaphore & hasExited_)351   DslDeferredInitRunnable( DslDictionary & dictionary_,
352                            QSemaphore & hasExited_ ):
353     dictionary( dictionary_ ), hasExited( hasExited_ )
354   {}
355 
~DslDeferredInitRunnable()356   ~DslDeferredInitRunnable()
357   {
358     hasExited.release();
359   }
360 
run()361   virtual void run()
362   {
363     dictionary.doDeferredInit();
364   }
365 };
366 
deferredInit()367 void DslDictionary::deferredInit()
368 {
369   if ( !Qt4x5::AtomicInt::loadAcquire( deferredInitDone ) )
370   {
371     Mutex::Lock _( deferredInitMutex );
372 
373     if ( Qt4x5::AtomicInt::loadAcquire( deferredInitDone ) )
374       return;
375 
376     if ( !deferredInitRunnableStarted )
377     {
378       QThreadPool::globalInstance()->start(
379         new DslDeferredInitRunnable( *this, deferredInitRunnableExited ),
380         -1000 );
381       deferredInitRunnableStarted = true;
382     }
383   }
384 }
385 
386 
ensureInitDone()387 string const & DslDictionary::ensureInitDone()
388 {
389   // Simple, really.
390   doDeferredInit();
391 
392   return initError;
393 }
394 
doDeferredInit()395 void DslDictionary::doDeferredInit()
396 {
397   if ( !Qt4x5::AtomicInt::loadAcquire( deferredInitDone ) )
398   {
399     Mutex::Lock _( deferredInitMutex );
400 
401     if ( Qt4x5::AtomicInt::loadAcquire( deferredInitDone ) )
402       return;
403 
404     // Do deferred init
405 
406     try
407     {
408       // Don't lock index file - no one should be working with it until
409       // the init is complete.
410       //Mutex::Lock _( idxMutex );
411 
412       chunks = new ChunkedStorage::Reader( idx, idxHeader.chunksOffset );
413 
414       // Open the .dsl file
415 
416       DZ_ERRORS error;
417       dz = dict_data_open( getDictionaryFilenames()[ 0 ].c_str(), &error, 0 );
418 
419       if ( !dz )
420         throw exDictzipError( string( dz_error_str( error ) )
421                               + "(" + getDictionaryFilenames()[ 0 ] + ")" );
422 
423       // Read the abrv, if any
424 
425       if ( idxHeader.hasAbrv )
426       {
427         vector< char > chunk;
428 
429         char * abrvBlock = chunks->getBlock( idxHeader.abrvAddress, chunk );
430 
431         uint32_t total;
432         memcpy( &total, abrvBlock, sizeof( uint32_t ) );
433         abrvBlock += sizeof( uint32_t );
434 
435         GD_DPRINTF( "Loading %u abbrv\n", total );
436 
437         while( total-- )
438         {
439           uint32_t keySz;
440           memcpy( &keySz, abrvBlock, sizeof( uint32_t ) );
441           abrvBlock += sizeof( uint32_t );
442 
443           char * key = abrvBlock;
444 
445           abrvBlock += keySz;
446 
447           uint32_t valueSz;
448           memcpy( &valueSz, abrvBlock, sizeof( uint32_t ) );
449           abrvBlock += sizeof( uint32_t );
450 
451           abrv[ string( key, keySz ) ] = string( abrvBlock, valueSz );
452 
453           abrvBlock += valueSz;
454         }
455       }
456 
457       // Initialize the index
458 
459       openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
460                             idxHeader.indexRootOffset ),
461                  idx, idxMutex );
462 
463       // Open a resource zip file, if there's one
464 
465       if ( idxHeader.hasZipFile &&
466            ( idxHeader.zipIndexBtreeMaxElements ||
467              idxHeader.zipIndexRootOffset ) )
468       {
469         resourceZip.openIndex( IndexInfo( idxHeader.zipIndexBtreeMaxElements,
470                                           idxHeader.zipIndexRootOffset ),
471                                idx, idxMutex );
472 
473         QString zipName = QDir::fromNativeSeparators(
474             FsEncoding::decode( getDictionaryFilenames().back().c_str() ) );
475 
476         if ( zipName.endsWith( ".zip", Qt::CaseInsensitive ) ) // Sanity check
477           resourceZip.openZipFile( zipName );
478       }
479     }
480     catch( std::exception & e )
481     {
482       initError = e.what();
483     }
484     catch( ... )
485     {
486       initError = "Unknown error";
487     }
488 
489     deferredInitDone.ref();
490   }
491 }
492 
493 
loadIcon()494 void DslDictionary::loadIcon() throw()
495 {
496   if ( dictionaryIconLoaded )
497     return;
498 
499   QString fileName =
500     QDir::fromNativeSeparators( FsEncoding::decode( getDictionaryFilenames()[ 0 ].c_str() ) );
501 
502   // Remove the extension
503   if ( fileName.endsWith( ".dsl.dz", Qt::CaseInsensitive ) )
504     fileName.chop( 6 );
505   else
506     fileName.chop( 3 );
507 
508   if ( !loadIconFromFile( fileName ) )
509   {
510     // Load failed -- use default icons
511     dictionaryIcon = QIcon(":/icons/icon32_dsl.png");
512     dictionaryNativeIcon = QIcon(":/icons/icon_dsl_native.png");
513   }
514 
515   dictionaryIconLoaded = true;
516 }
517 
518 /// Determines whether or not this char is treated as whitespace for dsl
519 /// parsing or not. We can't rely on any Unicode standards here, since the
520 /// only standard that matters here is the original Dsl compiler's insides.
521 /// Some dictionaries, for instance, are known to specifically use a non-
522 /// breakable space (0xa0) to indicate that a headword begins with a space,
523 /// so nbsp is not a whitespace character for Dsl compiler.
524 /// For now we have only space and tab, since those are most likely the only
525 /// ones recognized as spaces by that compiler.
isDslWs(wchar ch)526 bool isDslWs( wchar ch )
527 {
528   switch( ch )
529   {
530     case ' ':
531     case '\t':
532       return true;
533     default:
534       return false;
535   }
536 }
537 
loadArticle(uint32_t address,wstring const & requestedHeadwordFolded,bool ignoreDiacritics,wstring & tildeValue,wstring & displayedHeadword,unsigned & headwordIndex,wstring & articleText)538 void DslDictionary::loadArticle( uint32_t address,
539                                  wstring const & requestedHeadwordFolded,
540                                  bool ignoreDiacritics,
541                                  wstring & tildeValue,
542                                  wstring & displayedHeadword,
543                                  unsigned & headwordIndex,
544                                  wstring & articleText )
545 {
546   wstring articleData;
547 
548   {
549     vector< char > chunk;
550 
551     char * articleProps;
552 
553     {
554       Mutex::Lock _( idxMutex );
555 
556       articleProps = chunks->getBlock( address, chunk );
557     }
558 
559     uint32_t articleOffset, articleSize;
560 
561     memcpy( &articleOffset, articleProps, sizeof( articleOffset ) );
562     memcpy( &articleSize, articleProps + sizeof( articleOffset ),
563             sizeof( articleSize ) );
564 
565     GD_DPRINTF( "offset = %x\n", articleOffset );
566 
567 
568     char * articleBody;
569 
570     {
571       Mutex::Lock _( dzMutex );
572 
573       articleBody = dict_data_read_( dz, articleOffset, articleSize, 0, 0 );
574     }
575 
576     if ( !articleBody )
577     {
578 //      throw exCantReadFile( getDictionaryFilenames()[ 0 ] );
579       articleData = GD_NATIVE_TO_WS( L"\n\r\t" ) + gd::toWString( QString( "DICTZIP error: " ) + dict_error_str( dz ) );
580     }
581     else
582     {
583       try
584       {
585         articleData =
586           DslIconv::toWstring(
587             DslIconv::getEncodingNameFor( DslEncoding( idxHeader.dslEncoding ) ),
588             articleBody, articleSize );
589         free( articleBody );
590 
591         // Strip DSL comments
592         bool b = false;
593         stripComments( articleData, b );
594       }
595       catch( ... )
596       {
597         free( articleBody );
598         throw;
599       }
600     }
601   }
602 
603   size_t pos = 0;
604   bool hadFirstHeadword = false;
605   bool foundDisplayedHeadword = false;
606 
607   // Check is we retrieve insided card
608   bool insidedCard = isDslWs( articleData.at( 0 ) );
609 
610   wstring tildeValueWithUnsorted; // This one has unsorted parts left
611   for( headwordIndex = 0; ; )
612   {
613     size_t begin = pos;
614 
615     pos = articleData.find_first_of( GD_NATIVE_TO_WS( L"\n\r" ), begin );
616 
617     if ( pos == wstring::npos )
618       pos = articleData.size();
619 
620     if ( !foundDisplayedHeadword )
621     {
622       // Process the headword
623 
624       wstring rawHeadword = wstring( articleData, begin, pos - begin );
625 
626       if( insidedCard && !rawHeadword.empty() && isDslWs( rawHeadword[ 0 ] ) )
627       {
628         // Headword of the insided card
629         wstring::size_type hpos = rawHeadword.find( L'@' );
630         if( hpos != string::npos )
631         {
632           wstring head = Folding::trimWhitespace( rawHeadword.substr( hpos + 1 ) );
633           hpos = head.find( L'~' );
634           while( hpos != string::npos )
635           {
636             if( hpos == 0 || head[ hpos ] != L'\\' )
637               break;
638             hpos = head.find( L'~', hpos + 1 );
639           }
640           if( hpos == string::npos )
641             rawHeadword = head;
642           else
643             rawHeadword.clear();
644         }
645       }
646 
647       if( !rawHeadword.empty() )
648       {
649         if ( !hadFirstHeadword )
650         {
651           // We need our tilde expansion value
652           tildeValue = rawHeadword;
653 
654           list< wstring > lst;
655 
656           expandOptionalParts( tildeValue, &lst );
657 
658           if ( lst.size() ) // Should always be
659             tildeValue = lst.front();
660 
661           tildeValueWithUnsorted = tildeValue;
662 
663           processUnsortedParts( tildeValue, false );
664         }
665         wstring str = rawHeadword;
666 
667         if ( hadFirstHeadword )
668           expandTildes( str, tildeValueWithUnsorted );
669 
670         processUnsortedParts( str, true );
671 
672         str = Folding::applySimpleCaseOnly( str );
673 
674         list< wstring > lst;
675         expandOptionalParts( str, &lst );
676 
677         // Does one of the results match the requested word? If so, we'd choose
678         // it as our headword.
679 
680         for( list< wstring >::iterator i = lst.begin(); i != lst.end(); ++i )
681         {
682           unescapeDsl( *i );
683           normalizeHeadword( *i );
684 
685           bool found;
686           if( ignoreDiacritics )
687             found = Folding::applyDiacriticsOnly( Folding::trimWhitespace( *i ) ) == Folding::applyDiacriticsOnly( requestedHeadwordFolded );
688           else
689             found = Folding::trimWhitespace( *i ) == requestedHeadwordFolded;
690 
691           if ( found )
692           {
693             // Found it. Now we should make a displayed headword for it.
694             if ( hadFirstHeadword )
695               expandTildes( rawHeadword, tildeValueWithUnsorted );
696 
697             processUnsortedParts( rawHeadword, false );
698 
699             displayedHeadword = rawHeadword;
700 
701             foundDisplayedHeadword = true;
702             break;
703           }
704         }
705 
706         if ( !foundDisplayedHeadword )
707         {
708           ++headwordIndex;
709           hadFirstHeadword = true;
710         }
711       }
712     }
713 
714 
715     if ( pos == articleData.size() )
716       break;
717 
718     // Skip \n\r
719 
720     if ( articleData[ pos ] == '\r' )
721       ++pos;
722 
723     if ( pos != articleData.size() )
724     {
725       if ( articleData[ pos ] == '\n' )
726         ++pos;
727     }
728 
729     if ( pos == articleData.size() )
730     {
731       // Ok, it's end of article
732       break;
733     }
734     if( isDslWs( articleData[ pos ] ) )
735     {
736      // Check for begin article text
737       if( insidedCard )
738       {
739         // Check for next insided headword
740         wstring::size_type hpos = articleData.find_first_of( GD_NATIVE_TO_WS( L"\n\r" ), pos );
741         if ( hpos == wstring::npos )
742           hpos = articleData.size();
743 
744         wstring str = wstring( articleData, pos, hpos - pos );
745 
746         hpos = str.find( L'@');
747         if( hpos == wstring::npos || str[ hpos - 1 ] == L'\\' || !isAtSignFirst( str ) )
748           break;
749       }
750       else
751         break;
752     }
753   }
754 
755   if ( !foundDisplayedHeadword )
756   {
757     // This is strange. Anyway, use tilde expansion value, it's better
758     // than nothing (or requestedHeadwordFolded for insided card.
759     if( insidedCard )
760       displayedHeadword = requestedHeadwordFolded;
761     else
762       displayedHeadword = tildeValue;
763   }
764 
765   if ( pos != articleData.size() )
766     articleText = wstring( articleData, pos );
767   else
768     articleText.clear();
769 }
770 
dslToHtml(wstring const & str,wstring const & headword)771 string DslDictionary::dslToHtml( wstring const & str, wstring const & headword )
772 {
773  // Normalize the string
774   wstring normalizedStr = gd::normalize( str );
775   currentHeadword = headword;
776 
777   ArticleDom dom( normalizedStr, getName(), headword );
778 
779   optionalPartNom = 0;
780 
781   string html = processNodeChildren( dom.root );
782 
783   return html;
784 }
785 
processNodeChildren(ArticleDom::Node const & node)786 string DslDictionary::processNodeChildren( ArticleDom::Node const & node )
787 {
788   string result;
789 
790   for( ArticleDom::Node::const_iterator i = node.begin(); i != node.end();
791        ++i )
792     result += nodeToHtml( *i );
793 
794   return result;
795 }
nodeToHtml(ArticleDom::Node const & node)796 string DslDictionary::nodeToHtml( ArticleDom::Node const & node )
797 {
798   string result;
799 
800   if ( !node.isTag )
801   {
802     result = Html::escape( Utf8::encode( node.text ) );
803 
804     // Handle all end-of-line
805 
806     string::size_type n;
807 
808     // Strip all '\r'
809     while( ( n = result.find( '\r' ) ) != string::npos )
810       result.erase( n, 1 );
811 
812     // Replace all '\n'
813     while( ( n = result.find( '\n' ) ) != string::npos )
814       result.replace( n, 1, "<p></p>" );
815 
816     return result;
817   }
818 
819   if ( node.tagName == GD_NATIVE_TO_WS( L"b" ) )
820     result += "<b class=\"dsl_b\">" + processNodeChildren( node ) + "</b>";
821   else
822   if ( node.tagName == GD_NATIVE_TO_WS( L"i" ) )
823     result += "<i class=\"dsl_i\">" + processNodeChildren( node ) + "</i>";
824   else
825   if ( node.tagName == GD_NATIVE_TO_WS( L"u" ) )
826   {
827     string nodeText = processNodeChildren( node );
828 
829     if ( nodeText.size() && isDslWs( nodeText[ 0 ] ) )
830       result.push_back( ' ' ); // Fix a common problem where in "foo[i] bar[/i]"
831                                // the space before "bar" gets underlined.
832 
833     result += "<span class=\"dsl_u\">" + nodeText + "</span>";
834   }
835   else
836   if ( node.tagName == GD_NATIVE_TO_WS( L"c" ) )
837   {
838     result += "<font color=\"" + ( node.tagAttrs.size() ?
839       Html::escape( Utf8::encode( node.tagAttrs ) ) : string( "c_default_color" ) )
840       + "\">" + processNodeChildren( node ) + "</font>";
841   }
842   else
843   if ( node.tagName == GD_NATIVE_TO_WS( L"*" ) )
844   {
845       string id = "O" + getId().substr( 0, 7 ) + "_" +
846                 QString::number( articleNom ).toStdString() +
847                 "_opt_" + QString::number( optionalPartNom++ ).toStdString();
848     result += "<span class=\"dsl_opt\" id=\"" + id + "\">" + processNodeChildren( node ) + "</span>";
849   }
850   else
851   if ( node.tagName == GD_NATIVE_TO_WS( L"m" ) )
852       result += "<div class=\"dsl_m\">" + processNodeChildren( node ) + "</div>";
853   else
854   if ( node.tagName.size() == 2 && node.tagName[ 0 ] == L'm' &&
855        iswdigit( node.tagName[ 1 ] ) )
856     result += "<div class=\"dsl_" + Utf8::encode( node.tagName ) + "\">" + processNodeChildren( node ) + "</div>";
857   else
858   if ( node.tagName == GD_NATIVE_TO_WS( L"trn" ) )
859     result += "<span class=\"dsl_trn\">" + processNodeChildren( node ) + "</span>";
860   else
861   if ( node.tagName == GD_NATIVE_TO_WS( L"ex" ) )
862     result += "<span class=\"dsl_ex\">" + processNodeChildren( node ) + "</span>";
863   else
864   if ( node.tagName == GD_NATIVE_TO_WS( L"com" ) )
865     result += "<span class=\"dsl_com\">" + processNodeChildren( node ) + "</span>";
866   else
867   if ( node.tagName == GD_NATIVE_TO_WS( L"s" ) || node.tagName == GD_NATIVE_TO_WS( L"video" ) )
868   {
869     string filename = Filetype::simplifyString( Utf8::encode( node.renderAsText() ), false );
870     string n =
871       getDictionaryFilenames()[ 0 ] + ".files" +
872       FsEncoding::separator() +
873       FsEncoding::encode( filename );
874 
875     if ( Filetype::isNameOfSound( filename ) )
876     {
877       // If we have the file here, do the exact reference to this dictionary.
878       // Otherwise, make a global 'search' one.
879 
880       bool search =
881           !File::exists( n ) && !File::exists( FsEncoding::dirname( getDictionaryFilenames()[ 0 ] ) +
882                                                FsEncoding::separator() +
883                                                FsEncoding::encode( filename ) ) &&
884           ( !resourceZip.isOpen() ||
885             !resourceZip.hasFile( Utf8::decode( filename ) ) );
886 
887       QUrl url;
888       url.setScheme( "gdau" );
889       url.setHost( QString::fromUtf8( search ? "search" : getId().c_str() ) );
890       url.setPath( Qt4x5::Url::ensureLeadingSlash( QString::fromUtf8( filename.c_str() ) ) );
891       if( search && idxHeader.hasSoundDictionaryName )
892         Qt4x5::Url::setFragment( url, QString::fromUtf8( preferredSoundDictionary.c_str() ) );
893 
894       string ref = string( "\"" ) + url.toEncoded().data() + "\"";
895 
896       result += addAudioLink( ref, getId() );
897 
898       result += "<span class=\"dsl_s_wav\"><a href=" + ref
899          + "><img src=\"qrcx://localhost/icons/playsound.png\" border=\"0\" align=\"absmiddle\" alt=\"Play\"/></a></span>";
900     }
901     else
902     if ( Filetype::isNameOfPicture( filename ) )
903     {
904       QUrl url;
905       url.setScheme( "bres" );
906       url.setHost( QString::fromUtf8( getId().c_str() ) );
907       url.setPath( Qt4x5::Url::ensureLeadingSlash( QString::fromUtf8( filename.c_str() ) ) );
908 
909       vector< char > imgdata;
910       bool resize = false;
911 
912       try
913       {
914         File::loadFromFile( n, imgdata );
915       }
916       catch( File::exCantOpen & )
917       {
918         try
919         {
920           n = FsEncoding::dirname( getDictionaryFilenames()[ 0 ] ) +
921               FsEncoding::separator() +
922               FsEncoding::encode( filename );
923           File::loadFromFile( n, imgdata );
924         }
925         catch( File::exCantOpen & )
926         {
927           // Try reading from zip file
928           if ( resourceZip.isOpen() )
929           {
930             Mutex::Lock _( resourceZipMutex );
931             resourceZip.loadFile( Utf8::decode( filename ), imgdata );
932           }
933         }
934       }
935       catch(...)
936       {
937       }
938 
939       if( !imgdata.empty() )
940       {
941         if( Filetype::isNameOfSvg( filename ) )
942         {
943           // We don't need to render svg file now
944 
945           QSvgRenderer svg;
946           svg.load( QByteArray::fromRawData( imgdata.data(), imgdata.size() ) );
947           if( svg.isValid() )
948           {
949             QSize imgsize = svg.defaultSize();
950             resize = maxPictureWidth > 0
951                      && imgsize.width() > maxPictureWidth;
952           }
953         }
954         else
955         {
956           QImage img = QImage::fromData( (unsigned char *) &imgdata.front(),
957                                          imgdata.size() );
958 
959 #ifdef MAKE_EXTRA_TIFF_HANDLER
960           if( img.isNull() && Filetype::isNameOfTiff( filename ) )
961             GdTiff::tiffToQImage( &imgdata.front(), imgdata.size(), img );
962 #endif
963 
964           resize = maxPictureWidth > 0
965                    && img.width() > maxPictureWidth;
966         }
967       }
968 
969       if( resize )
970       {
971         string link( url.toEncoded().data() );
972         link.replace( 0, 4, "gdpicture" );
973         result += string( "<a href=\"" ) + link + "\">"
974                           + "<img src=\"" + url.toEncoded().data()
975                           + "\" alt=\"" + Html::escape( filename ) + "\""
976                           + "width=\"" + QString::number( maxPictureWidth).toStdString() + "\"/>"
977                           + "</a>";
978       }
979       else
980         result += string( "<img src=\"" ) + url.toEncoded().data()
981                   + "\" alt=\"" + Html::escape( filename ) + "\"/>";
982     }
983     else
984     if ( Filetype::isNameOfVideo( filename ) ) {
985       QUrl url;
986       url.setScheme( "gdvideo" );
987       url.setHost( QString::fromUtf8( getId().c_str() ) );
988       url.setPath( Qt4x5::Url::ensureLeadingSlash( QString::fromUtf8( filename.c_str() ) ) );
989 
990       result += string( "<a class=\"dsl_s dsl_video\" href=\"" ) + url.toEncoded().data() + "\">"
991              + "<span class=\"img\"></span>"
992              + "<span class=\"filename\">" + processNodeChildren( node ) + "</span>" + "</a>";
993     }
994     else
995     {
996       // Unknown file type, downgrade to a hyperlink
997 
998       QUrl url;
999       url.setScheme( "bres" );
1000       url.setHost( QString::fromUtf8( getId().c_str() ) );
1001       url.setPath( Qt4x5::Url::ensureLeadingSlash( QString::fromUtf8( filename.c_str() ) ) );
1002 
1003       result += string( "<a class=\"dsl_s\" href=\"" ) + url.toEncoded().data()
1004              + "\">" + processNodeChildren( node ) + "</a>";
1005     }
1006   }
1007   else
1008   if ( node.tagName == GD_NATIVE_TO_WS( L"url" ) )
1009   {
1010     string link = Html::escape( Filetype::simplifyString( Utf8::encode( node.renderAsText() ), false ) );
1011     if( QUrl::fromEncoded( link.c_str() ).scheme().isEmpty() )
1012       link = "http://" + link;
1013 
1014     QUrl url( QString::fromUtf8( link.c_str() ) );
1015     if( url.isLocalFile() && url.host().isEmpty() )
1016     {
1017       // Convert relative links to local files to absolute ones
1018       QString name = QFileInfo( getMainFilename() ).absolutePath();
1019       name += url.toLocalFile();
1020       QFileInfo info( name );
1021       if( info.isFile() )
1022       {
1023         name = info.canonicalFilePath();
1024         url.setPath( Qt4x5::Url::ensureLeadingSlash( QUrl::fromLocalFile( name ).path() ) );
1025         link = string( url.toEncoded().data() );
1026       }
1027     }
1028 
1029     result += "<a class=\"dsl_url\" href=\"" + link +"\">" + processNodeChildren( node ) + "</a>";
1030   }
1031   else
1032   if ( node.tagName == GD_NATIVE_TO_WS( L"!trs" ) )
1033   {
1034     result += "<span class=\"dsl_trs\">" + processNodeChildren( node ) + "</span>";
1035   }
1036   else
1037   if ( node.tagName == GD_NATIVE_TO_WS( L"p") )
1038   {
1039     result += "<span class=\"dsl_p\"";
1040 
1041     string val = Utf8::encode( node.renderAsText() );
1042 
1043     // If we have such a key, display a title
1044 
1045     map< string, string >::const_iterator i = abrv.find( val );
1046 
1047     if ( i != abrv.end() )
1048     {
1049       string title;
1050 
1051       if ( Utf8::decode( i->second ).size() < 70 )
1052       {
1053         // Replace all spaces with non-breakable ones, since that's how
1054         // Lingvo shows tooltips
1055         title.reserve( i->second.size() );
1056 
1057         for( char const * c = i->second.c_str(); *c; ++c )
1058         {
1059           if ( *c == ' ' || *c == '\t' )
1060           {
1061             // u00A0 in utf8
1062             title.push_back( 0xC2 );
1063             title.push_back( 0xA0 );
1064           }
1065           else
1066           if( *c == '-' ) // Change minus to non-breaking hyphen (uE28091 in utf8)
1067           {
1068             title.push_back( 0xE2 );
1069             title.push_back( 0x80 );
1070             title.push_back( 0x91 );
1071           }
1072           else
1073             title.push_back( *c );
1074         }
1075       }
1076       else
1077         title = i->second;
1078 
1079       result += " title=\"" + Html::escape( title ) + "\"";
1080     }
1081 
1082     result += ">" + processNodeChildren( node ) + "</span>";
1083   }
1084   else
1085   if ( node.tagName == GD_NATIVE_TO_WS( L"'" ) )
1086   {
1087     // There are two ways to display the stress: by adding an accent sign or via font styles.
1088     // We generate two spans, one with accented data and another one without it, so the
1089     // user could pick up the best suitable option.
1090     string data = processNodeChildren( node );
1091     result += "<span class=\"dsl_stress\"><span class=\"dsl_stress_without_accent\">" + data + "</span>"
1092         + "<span class=\"dsl_stress_with_accent\">" + data + Utf8::encode( wstring( 1, 0x301 ) )
1093         + "</span></span>";
1094   }
1095   else
1096   if ( node.tagName == GD_NATIVE_TO_WS( L"lang" ) )
1097   {
1098     result += "<span class=\"dsl_lang\"";
1099     if( !node.tagAttrs.empty() )
1100     {
1101       // Find ISO 639-1 code
1102       string langcode;
1103       QString attr = gd::toQString( node.tagAttrs );
1104       int n = attr.indexOf( "id=" );
1105       if( n >= 0 )
1106       {
1107         int id = attr.mid( n + 3 ).toInt();
1108         if( id )
1109           langcode = findCodeForDslId( id );
1110       }
1111       else
1112       {
1113         n = attr.indexOf( "name=\"" );
1114         if( n >= 0 )
1115         {
1116           int n2 = attr.indexOf( '\"', n + 6 );
1117           if( n2 > 0 )
1118           {
1119             quint32 id = dslLanguageToId( gd::toWString( attr.mid( n + 6, n2 - n - 6 ) ) );
1120             langcode = LangCoder::intToCode2( id ).toStdString();
1121           }
1122         }
1123       }
1124       if( !langcode.empty() )
1125         result += " lang=\"" + langcode + "\"";
1126     }
1127     result += ">" + processNodeChildren( node ) + "</span>";
1128   }
1129   else
1130   if ( node.tagName == GD_NATIVE_TO_WS( L"ref" ) )
1131   {
1132     QUrl url;
1133 
1134     url.setScheme( "gdlookup" );
1135     url.setHost( "localhost" );
1136     url.setPath( Qt4x5::Url::ensureLeadingSlash( gd::toQString( node.renderAsText() ) ) );
1137     if( !node.tagAttrs.empty() )
1138     {
1139       QString attr = gd::toQString( node.tagAttrs ).remove( '\"' );
1140       int n = attr.indexOf( '=' );
1141       if( n > 0 )
1142       {
1143         QList< QPair< QString, QString > > query;
1144         query.append( QPair< QString, QString >( attr.left( n ), attr.mid( n + 1 ) ) );
1145         Qt4x5::Url::setQueryItems( url, query );
1146       }
1147     }
1148 
1149     result += string( "<a class=\"dsl_ref\" href=\"" ) + url.toEncoded().data() +"\">"
1150               + processNodeChildren( node ) + "</a>";
1151   }
1152   else
1153   if ( node.tagName == GD_NATIVE_TO_WS( L"@" ) )
1154   {
1155     // Special case - insided card header was not parsed
1156 
1157     QUrl url;
1158 
1159     url.setScheme( "gdlookup" );
1160     url.setHost( "localhost" );
1161     wstring nodeStr = node.renderAsText();
1162     normalizeHeadword( nodeStr );
1163     url.setPath( Qt4x5::Url::ensureLeadingSlash( gd::toQString( nodeStr ) ) );
1164 
1165     result += string( "<a class=\"dsl_ref\" href=\"" ) + url.toEncoded().data() +"\">"
1166               + processNodeChildren( node ) + "</a>";
1167   }
1168   else
1169   if ( node.tagName == GD_NATIVE_TO_WS( L"sub" ) )
1170   {
1171     result += "<sub>" + processNodeChildren( node ) + "</sub>";
1172   }
1173   else
1174   if ( node.tagName == GD_NATIVE_TO_WS( L"sup" ) )
1175   {
1176     result += "<sup>" + processNodeChildren( node ) + "</sup>";
1177   }
1178   else
1179   if ( node.tagName == GD_NATIVE_TO_WS( L"t" ) )
1180   {
1181     result += "<span class=\"dsl_t\">" + processNodeChildren( node ) + "</span>";
1182   }
1183   else
1184   if ( node.tagName == GD_NATIVE_TO_WS( L"br" ) )
1185   {
1186     result += "<br />";
1187   }
1188   else
1189   {
1190     gdWarning( "DSL: Unknown tag \"%s\" with attributes \"%s\" found in \"%s\", article \"%s\".",
1191                gd::toQString( node.tagName ).toUtf8().data(), gd::toQString( node.tagAttrs ).toUtf8().data(),
1192                getName().c_str(), gd::toQString( currentHeadword ).toUtf8().data() );
1193 
1194     result += "<span class=\"dsl_unknown\">[" + string( gd::toQString( node.tagName ).toUtf8().data() );
1195     if( !node.tagAttrs.empty() )
1196       result += " " + string( gd::toQString( node.tagAttrs ).toUtf8().data() );
1197     result += "]" + processNodeChildren( node ) + "</span>";
1198   }
1199 
1200   return result;
1201 }
1202 
getDescription()1203 QString const& DslDictionary::getDescription()
1204 {
1205     if( !dictionaryDescription.isEmpty() )
1206         return dictionaryDescription;
1207 
1208     dictionaryDescription = "NONE";
1209 
1210     QString fileName =
1211       QDir::fromNativeSeparators( FsEncoding::decode( getDictionaryFilenames()[ 0 ].c_str() ) );
1212 
1213     // Remove the extension
1214     if ( fileName.endsWith( ".dsl.dz", Qt::CaseInsensitive ) )
1215       fileName.chop( 6 );
1216     else
1217       fileName.chop( 3 );
1218 
1219     fileName += "ann";
1220     QFileInfo info( fileName );
1221 
1222     if ( info.exists() )
1223     {
1224         QFile annFile( fileName );
1225         if( !annFile.open( QFile::ReadOnly | QFile::Text ) )
1226             return dictionaryDescription;
1227 
1228         QTextStream annStream( &annFile );
1229         QString data, str;
1230 
1231         str = annStream.readLine();
1232 
1233         if( str.left( 10 ).compare( "#LANGUAGE " ) != 0 )
1234         {
1235             annStream.seek( 0 );
1236             dictionaryDescription = annStream.readAll();
1237         }
1238         else
1239         {
1240             // Multilanguage annotation
1241 
1242             qint32 gdLang, annLang;
1243             QString langStr;
1244             gdLang = LangCoder::code2toInt( QLocale::system().name().left( 2 ).toLatin1().data() );
1245             for(;;)
1246             {
1247                 data.clear();
1248                 langStr = str.mid( 10 ).replace( '\"', ' ').trimmed();
1249                 annLang = LangCoder::findIdForLanguage( gd::toWString( langStr ) );
1250                 do
1251                 {
1252                     str = annStream.readLine();
1253                     if( str.left( 10 ).compare( "#LANGUAGE " ) == 0 )
1254                         break;
1255                     if( !str.endsWith( '\n' ) )
1256                         str.append( '\n' );
1257                     data += str;
1258                 }
1259                 while ( !annStream.atEnd() );
1260                 if( dictionaryDescription.compare( "NONE ") == 0 || langStr.compare( "English", Qt::CaseInsensitive ) == 0 || gdLang == annLang )
1261                     dictionaryDescription = data.trimmed();
1262                 if( gdLang == annLang || annStream.atEnd() )
1263                     break;
1264             }
1265         }
1266     }
1267     return dictionaryDescription;
1268 }
1269 
getMainFilename()1270 QString DslDictionary::getMainFilename()
1271 {
1272   return FsEncoding::decode( getDictionaryFilenames()[ 0 ].c_str() );
1273 }
1274 
makeFTSIndex(QAtomicInt & isCancelled,bool firstIteration)1275 void DslDictionary::makeFTSIndex( QAtomicInt & isCancelled, bool firstIteration )
1276 {
1277   if( !( Dictionary::needToRebuildIndex( getDictionaryFilenames(), ftsIdxName )
1278          || FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this ) ) )
1279     FTS_index_completed.ref();
1280 
1281 
1282   if( haveFTSIndex() )
1283     return;
1284 
1285   if( ensureInitDone().size() )
1286     return;
1287 
1288   if( firstIteration && getArticleCount() > FTS::MaxDictionarySizeForFastSearch )
1289     return;
1290 
1291   gdDebug( "Dsl: Building the full-text index for dictionary: %s\n",
1292            getName().c_str() );
1293 
1294   try
1295   {
1296     FtsHelpers::makeFTSIndex( this, isCancelled );
1297     FTS_index_completed.ref();
1298   }
1299   catch( std::exception &ex )
1300   {
1301     gdWarning( "DSL: Failed building full-text search index for \"%s\", reason: %s\n", getName().c_str(), ex.what() );
1302     QFile::remove( FsEncoding::decode( ftsIdxName.c_str() ) );
1303   }
1304 }
1305 
getArticleText(uint32_t articleAddress,QString & headword,QString & text)1306 void DslDictionary::getArticleText( uint32_t articleAddress, QString & headword, QString & text )
1307 {
1308   headword.clear();
1309   text.clear();
1310 
1311   vector< char > chunk;
1312 
1313   char * articleProps;
1314   wstring articleData;
1315 
1316   {
1317     Mutex::Lock _( idxMutex );
1318     articleProps = chunks->getBlock( articleAddress, chunk );
1319   }
1320 
1321   uint32_t articleOffset, articleSize;
1322 
1323   memcpy( &articleOffset, articleProps, sizeof( articleOffset ) );
1324   memcpy( &articleSize, articleProps + sizeof( articleOffset ),
1325           sizeof( articleSize ) );
1326 
1327   char * articleBody;
1328 
1329   {
1330     Mutex::Lock _( dzMutex );
1331     articleBody = dict_data_read_( dz, articleOffset, articleSize, 0, 0 );
1332   }
1333 
1334   if ( !articleBody )
1335   {
1336     return;
1337   }
1338   else
1339   {
1340     try
1341     {
1342       articleData =
1343         DslIconv::toWstring(
1344           DslIconv::getEncodingNameFor( DslEncoding( idxHeader.dslEncoding ) ),
1345           articleBody, articleSize );
1346       free( articleBody );
1347 
1348       // Strip DSL comments
1349       bool b = false;
1350       stripComments( articleData, b );
1351     }
1352     catch( ... )
1353     {
1354       free( articleBody );
1355       return;
1356     }
1357   }
1358 
1359   // Skip headword
1360 
1361   size_t pos = 0;
1362   wstring articleHeadword, tildeValue;
1363 
1364   // Check if we retrieve insided card
1365   bool insidedCard = isDslWs( articleData.at( 0 ) );
1366 
1367   for( ; ; )
1368   {
1369     size_t begin = pos;
1370 
1371     pos = articleData.find_first_of( GD_NATIVE_TO_WS( L"\n\r" ), begin );
1372 
1373     if ( articleHeadword.empty() )
1374     {
1375       // Process the headword
1376 
1377       articleHeadword = wstring( articleData, begin, pos - begin );
1378 
1379       if( insidedCard && !articleHeadword.empty() && isDslWs( articleHeadword[ 0 ] ) )
1380       {
1381         // Headword of the insided card
1382         wstring::size_type hpos = articleHeadword.find( L'@' );
1383         if( hpos != string::npos )
1384         {
1385           wstring head = Folding::trimWhitespace( articleHeadword.substr( hpos + 1 ) );
1386           hpos = head.find( L'~' );
1387           while( hpos != string::npos )
1388           {
1389             if( hpos == 0 || head[ hpos ] != L'\\' )
1390               break;
1391             hpos = head.find( L'~', hpos + 1 );
1392           }
1393           if( hpos == string::npos )
1394             articleHeadword = head;
1395           else
1396             articleHeadword.clear();
1397         }
1398       }
1399 
1400       if( !articleHeadword.empty() )
1401       {
1402         list< wstring > lst;
1403 
1404         tildeValue = articleHeadword;
1405 
1406         processUnsortedParts( articleHeadword, true );
1407         expandOptionalParts( articleHeadword, &lst );
1408 
1409         if ( lst.size() ) // Should always be
1410           articleHeadword = lst.front();
1411       }
1412     }
1413 
1414     if ( pos == articleData.size() )
1415       break;
1416 
1417     // Skip \n\r
1418 
1419     if ( articleData[ pos ] == '\r' )
1420       ++pos;
1421 
1422     if ( pos != articleData.size() )
1423     {
1424       if ( articleData[ pos ] == '\n' )
1425         ++pos;
1426     }
1427 
1428     if ( pos == articleData.size() )
1429     {
1430       // Ok, it's end of article
1431       break;
1432     }
1433     if( isDslWs( articleData[ pos ] ) )
1434     {
1435      // Check for begin article text
1436       if( insidedCard )
1437       {
1438         // Check for next insided headword
1439         wstring::size_type hpos = articleData.find_first_of( GD_NATIVE_TO_WS( L"\n\r" ), pos );
1440         if ( hpos == wstring::npos )
1441           hpos = articleData.size();
1442 
1443         wstring str = wstring( articleData, pos, hpos - pos );
1444 
1445         hpos = str.find( L'@');
1446         if( hpos == wstring::npos || str[ hpos - 1 ] == L'\\' || !isAtSignFirst( str ) )
1447           break;
1448       }
1449       else
1450         break;
1451     }
1452   }
1453 
1454   if( !articleHeadword.empty() )
1455   {
1456     unescapeDsl( articleHeadword );
1457     normalizeHeadword( articleHeadword );
1458     headword = gd::toQString( articleHeadword );
1459   }
1460 
1461   wstring articleText;
1462 
1463   if ( pos != articleData.size() )
1464     articleText = wstring( articleData, pos );
1465   else
1466     articleText.clear();
1467 
1468   if( !tildeValue.empty() )
1469   {
1470     list< wstring > lst;
1471 
1472     processUnsortedParts( tildeValue, false );
1473     expandOptionalParts( tildeValue, &lst );
1474 
1475     if ( lst.size() ) // Should always be
1476       expandTildes( articleText, lst.front() );
1477   }
1478 
1479   if( !articleText.empty() )
1480   {
1481     text = gd::toQString( articleText ).normalized( QString::NormalizationForm_C );
1482 
1483     articleText.clear();
1484 
1485     // Parse article text
1486 
1487     // Strip some areas
1488 
1489     const int stripTagsNumber = 5;
1490     static QString stripTags[ stripTagsNumber ] =
1491                                                   {
1492                                                     "s",
1493                                                     "url",
1494                                                     "!trs",
1495                                                     "video",
1496                                                     "preview"
1497                                                   };
1498     static QString stripEndTags[ stripTagsNumber ] =
1499                                                   {
1500                                                     "[/s]",
1501                                                     "[/url]",
1502                                                     "[/!trs]",
1503                                                     "[/video]",
1504                                                     "[/preview]"
1505                                                   };
1506 
1507     int pos = 0;
1508     while( pos >= 0 )
1509     {
1510       pos = text.indexOf( '[', pos, Qt::CaseInsensitive );
1511       if( pos >= 0 )
1512       {
1513         if( ( pos > 0 && text[ pos - 1 ] == '\\' && ( pos < 2 || text[ pos - 2 ] != '\\' ) )
1514               || ( pos > text.size() - 2 || text[ pos + 1 ] == '/' ) )
1515         {
1516           pos += 1;
1517           continue;
1518         }
1519 
1520         int pos2 = text.indexOf( ']',  pos + 1, Qt::CaseInsensitive );
1521         if( pos2 < 0 )
1522           break;
1523 
1524         QString tag = text.mid( pos + 1, pos2 - pos - 1 );
1525 
1526         int n;
1527         for( n = 0; n < stripTagsNumber; n++ )
1528         {
1529           if( tag.compare( stripTags[ n ], Qt::CaseInsensitive ) == 0 )
1530           {
1531             pos2 = text.indexOf( stripEndTags[ n ] , pos + stripTags[ n ].size() + 2, Qt::CaseInsensitive );
1532             text.replace( pos, pos2 > 0 ? pos2 - pos + stripEndTags[ n ].length() : text.length() - pos, " " );
1533             break;
1534           }
1535         }
1536 
1537         if( n >= stripTagsNumber )
1538           pos += 1;
1539       }
1540     }
1541 
1542     // Strip tags
1543 
1544 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
1545     text.replace( QRegularExpression( "\\[(|/)(p|trn|ex|com|\\*|t|br|m[0-9]?)\\]" ), " " );
1546     text.replace( QRegularExpression( "\\[(|/)lang(\\s[^\\]]*)?\\]" ), " " );
1547     text.remove( QRegularExpression( "\\[[^\\\\\\[\\]]+\\]" ) );
1548 #else
1549     text.replace( QRegExp( "\\[(|/)(p|trn|ex|com|\\*|t|br|m[0-9]?)\\]" ), " " );
1550     text.replace( QRegExp( "\\[(|/)lang(\\s[^\\]]*)?\\]" ), " " );
1551     text.remove( QRegExp( "\\[[^\\\\\\[\\]]+\\]" ) );
1552 #endif
1553     text.remove( QString::fromLatin1( "<<" ) );
1554     text.remove( QString::fromLatin1( ">>" ) );
1555 
1556     // Chech for insided cards
1557 
1558     bool haveInsidedCards = false;
1559     pos = 0;
1560     while( pos >= 0 )
1561     {
1562       pos = text.indexOf( "@", pos );
1563       if( pos > 0 && text.at( pos - 1 ) != '\\' )
1564       {
1565         haveInsidedCards = true;
1566         break;
1567       }
1568 
1569       if( pos >= 0 )
1570         pos += 1;
1571     }
1572 
1573     if( haveInsidedCards )
1574     {
1575       // Use base DSL parser for articles with insided cards
1576       ArticleDom dom( gd::toWString( text ), getName(), articleHeadword );
1577       text = gd::toQString( dom.root.renderAsText( true ) );
1578     }
1579     else
1580     {
1581       // Unescape DSL symbols
1582       pos = 0;
1583       while( pos >= 0 )
1584       {
1585         pos = text.indexOf( '\\', pos );
1586         if( pos >= 0 )
1587         {
1588           if( text[ pos + 1 ] == '\\' )
1589             pos += 1;
1590 
1591           text.remove( pos, 1 );
1592         }
1593       }
1594     }
1595   }
1596 }
1597 
1598 /// DslDictionary::getArticle()
1599 
1600 class DslArticleRequest;
1601 
1602 class DslArticleRequestRunnable: public QRunnable
1603 {
1604   DslArticleRequest & r;
1605   QSemaphore & hasExited;
1606 
1607 public:
1608 
DslArticleRequestRunnable(DslArticleRequest & r_,QSemaphore & hasExited_)1609   DslArticleRequestRunnable( DslArticleRequest & r_,
1610                              QSemaphore & hasExited_ ): r( r_ ),
1611                                                         hasExited( hasExited_ )
1612   {}
1613 
~DslArticleRequestRunnable()1614   ~DslArticleRequestRunnable()
1615   {
1616     hasExited.release();
1617   }
1618 
1619   virtual void run();
1620 };
1621 
1622 class DslArticleRequest: public Dictionary::DataRequest
1623 {
1624   friend class DslArticleRequestRunnable;
1625 
1626   wstring word;
1627   vector< wstring > alts;
1628   DslDictionary & dict;
1629   bool ignoreDiacritics;
1630 
1631   QAtomicInt isCancelled;
1632   QSemaphore hasExited;
1633 
1634 public:
1635 
DslArticleRequest(wstring const & word_,vector<wstring> const & alts_,DslDictionary & dict_,bool ignoreDiacritics_)1636   DslArticleRequest( wstring const & word_,
1637                      vector< wstring > const & alts_,
1638                      DslDictionary & dict_, bool ignoreDiacritics_ ):
1639     word( word_ ), alts( alts_ ), dict( dict_ ), ignoreDiacritics( ignoreDiacritics_ )
1640   {
1641     QThreadPool::globalInstance()->start(
1642       new DslArticleRequestRunnable( *this, hasExited ) );
1643   }
1644 
1645   void run(); // Run from another thread by DslArticleRequestRunnable
1646 
cancel()1647   virtual void cancel()
1648   {
1649     isCancelled.ref();
1650   }
1651 
~DslArticleRequest()1652   ~DslArticleRequest()
1653   {
1654     isCancelled.ref();
1655     hasExited.acquire();
1656   }
1657 };
1658 
run()1659 void DslArticleRequestRunnable::run()
1660 {
1661   r.run();
1662 }
1663 
run()1664 void DslArticleRequest::run()
1665 {
1666   if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
1667   {
1668     finish();
1669     return;
1670   }
1671 
1672   if ( dict.ensureInitDone().size() )
1673   {
1674     setErrorString( QString::fromUtf8( dict.ensureInitDone().c_str() ) );
1675     finish();
1676     return;
1677   }
1678 
1679   vector< WordArticleLink > chain = dict.findArticles( word, ignoreDiacritics );
1680 
1681   for( unsigned x = 0; x < alts.size(); ++x )
1682   {
1683     /// Make an additional query for each alt
1684 
1685     vector< WordArticleLink > altChain = dict.findArticles( alts[ x ], ignoreDiacritics );
1686 
1687     chain.insert( chain.end(), altChain.begin(), altChain.end() );
1688   }
1689 
1690   // Some synonyms make it that the articles appear several times. We combat
1691   // this by only allowing them to appear once. Dsl treats different headwords
1692   // of the same article as different articles, so we also include headword
1693   // index here.
1694   set< pair< uint32_t, unsigned > > articlesIncluded;
1695 
1696   wstring wordCaseFolded = Folding::applySimpleCaseOnly( word );
1697 
1698   for( unsigned x = 0; x < chain.size(); ++x )
1699   {
1700     // Check if we're cancelled occasionally
1701     if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
1702     {
1703       finish();
1704       return;
1705     }
1706 
1707     // Grab that article
1708 
1709     wstring tildeValue;
1710     wstring displayedHeadword;
1711     wstring articleBody;
1712     unsigned headwordIndex;
1713 
1714     string articleText, articleAfter;
1715 
1716     try
1717     {
1718       dict.loadArticle( chain[ x ].articleOffset, wordCaseFolded, ignoreDiacritics, tildeValue,
1719                         displayedHeadword, headwordIndex, articleBody );
1720 
1721       if ( !articlesIncluded.insert( std::make_pair( chain[ x ].articleOffset,
1722                                                      headwordIndex ) ).second )
1723         continue; // We already have this article in the body.
1724 
1725       dict.articleNom += 1;
1726 
1727       if( displayedHeadword.empty() || isDslWs( displayedHeadword[ 0 ] ) )
1728         displayedHeadword = word; // Special case - insided card
1729 
1730       articleText += "<div class=\"dsl_article\">";
1731       articleText += "<div class=\"dsl_headwords\"";
1732       if( dict.isFromLanguageRTL() )
1733         articleText += " dir=\"rtl\"";
1734       articleText += "><p>";
1735 
1736       if( displayedHeadword.size() == 1 && displayedHeadword[0] == '<' )  // Fix special case - "<" header
1737           articleText += "<";                                             // dslToHtml can't handle it correctly.
1738       else
1739         articleText += dict.dslToHtml( displayedHeadword, displayedHeadword );
1740 
1741       /// After this may be expand button will be inserted
1742 
1743       articleAfter += "</p></div>";
1744 
1745       expandTildes( articleBody, tildeValue );
1746 
1747       articleAfter += "<div class=\"dsl_definition\"";
1748       if( dict.isToLanguageRTL() )
1749         articleAfter += " dir=\"rtl\"";
1750       articleAfter += ">";
1751 
1752       articleAfter += dict.dslToHtml( articleBody, displayedHeadword );
1753       articleAfter += "</div>";
1754       articleAfter += "</div>";
1755 
1756       if( dict.hasHiddenZones() )
1757       {
1758         string prefix = "O" + dict.getId().substr( 0, 7 ) + "_" + QString::number( dict.articleNom ).toStdString();
1759         string id1 = prefix + "_expand";
1760         string id2 = prefix + "_opt_";
1761         string button = " <img src=\"qrcx://localhost/icons/expand_opt.png\" class=\"hidden_expand_opt\" id=\"" + id1 +
1762                         "\" onclick=\"gdExpandOptPart('" + id1 + "','" + id2 +"')\" alt=\"[+]\"/>";
1763         if( articleText.compare( articleText.size() - 4, 4, "</p>" ) == 0 )
1764           articleText.insert( articleText.size() - 4, " " + button );
1765         else
1766           articleText += button;
1767       }
1768 
1769       articleText += articleAfter;
1770     }
1771     catch( std::exception &ex )
1772     {
1773       gdWarning( "DSL: Failed loading article from \"%s\", reason: %s\n", dict.getName().c_str(), ex.what() );
1774       articleText = string( "<span class=\"dsl_article\">" )
1775                     + string( QObject::tr( "Article loading error" ).toUtf8().constData() )
1776                     + "</span>";
1777     }
1778 
1779     Mutex::Lock _( dataMutex );
1780 
1781     data.resize( data.size() + articleText.size() );
1782 
1783     memcpy( &data.front() + data.size() - articleText.size(),
1784             articleText.data(), articleText.size() );
1785 
1786     hasAnyData = true;
1787   }
1788 
1789   finish();
1790 }
1791 
getArticle(wstring const & word,vector<wstring> const & alts,wstring const &,bool ignoreDiacritics)1792 sptr< Dictionary::DataRequest > DslDictionary::getArticle( wstring const & word,
1793                                                            vector< wstring > const & alts,
1794                                                            wstring const &,
1795                                                            bool ignoreDiacritics )
1796   THROW_SPEC( std::exception )
1797 {
1798   return new DslArticleRequest( word, alts, *this, ignoreDiacritics );
1799 }
1800 
1801 //// DslDictionary::getResource()
1802 
1803 class DslResourceRequest;
1804 
1805 class DslResourceRequestRunnable: public QRunnable
1806 {
1807   DslResourceRequest & r;
1808   QSemaphore & hasExited;
1809 
1810 public:
1811 
DslResourceRequestRunnable(DslResourceRequest & r_,QSemaphore & hasExited_)1812   DslResourceRequestRunnable( DslResourceRequest & r_,
1813                               QSemaphore & hasExited_ ): r( r_ ),
1814                                                          hasExited( hasExited_ )
1815   {}
1816 
~DslResourceRequestRunnable()1817   ~DslResourceRequestRunnable()
1818   {
1819     hasExited.release();
1820   }
1821 
1822   virtual void run();
1823 };
1824 
1825 class DslResourceRequest: public Dictionary::DataRequest
1826 {
1827   friend class DslResourceRequestRunnable;
1828 
1829   DslDictionary & dict;
1830 
1831   string resourceName;
1832 
1833   QAtomicInt isCancelled;
1834   QSemaphore hasExited;
1835 
1836 public:
1837 
DslResourceRequest(DslDictionary & dict_,string const & resourceName_)1838   DslResourceRequest( DslDictionary & dict_,
1839                       string const & resourceName_ ):
1840     dict( dict_ ),
1841     resourceName( resourceName_ )
1842   {
1843     QThreadPool::globalInstance()->start(
1844       new DslResourceRequestRunnable( *this, hasExited ) );
1845   }
1846 
1847   void run(); // Run from another thread by DslResourceRequestRunnable
1848 
cancel()1849   virtual void cancel()
1850   {
1851     isCancelled.ref();
1852   }
1853 
~DslResourceRequest()1854   ~DslResourceRequest()
1855   {
1856     isCancelled.ref();
1857     hasExited.acquire();
1858   }
1859 };
1860 
run()1861 void DslResourceRequestRunnable::run()
1862 {
1863   r.run();
1864 }
1865 
run()1866 void DslResourceRequest::run()
1867 {
1868   // Some runnables linger enough that they are cancelled before they start
1869   if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
1870   {
1871     finish();
1872     return;
1873   }
1874 
1875   if ( dict.ensureInitDone().size() )
1876   {
1877     setErrorString( QString::fromUtf8( dict.ensureInitDone().c_str() ) );
1878     finish();
1879     return;
1880   }
1881 
1882   string n =
1883     FsEncoding::dirname( dict.getDictionaryFilenames()[ 0 ] ) +
1884     FsEncoding::separator() +
1885     FsEncoding::encode( resourceName );
1886 
1887   GD_DPRINTF( "n is %s\n", n.c_str() );
1888 
1889   try
1890   {
1891     try
1892     {
1893       Mutex::Lock _( dataMutex );
1894 
1895       File::loadFromFile( n, data );
1896     }
1897     catch( File::exCantOpen & )
1898     {
1899       n = dict.getDictionaryFilenames()[ 0 ] + ".files" +
1900           FsEncoding::separator() +
1901           FsEncoding::encode( resourceName );
1902 
1903       try
1904       {
1905         Mutex::Lock _( dataMutex );
1906 
1907         File::loadFromFile( n, data );
1908       }
1909       catch( File::exCantOpen & )
1910       {
1911         // Try reading from zip file
1912 
1913         if ( dict.resourceZip.isOpen() )
1914         {
1915           Mutex::Lock _( dict.resourceZipMutex );
1916 
1917           Mutex::Lock __( dataMutex );
1918 
1919           if ( !dict.resourceZip.loadFile( Utf8::decode( resourceName ), data ) )
1920             throw; // Make it fail since we couldn't read the archive
1921         }
1922         else
1923           throw;
1924       }
1925     }
1926 
1927     if ( Filetype::isNameOfTiff( resourceName ) )
1928     {
1929       // Convert it
1930 
1931       dataMutex.lock();
1932 
1933       QImage img = QImage::fromData( (unsigned char *) &data.front(),
1934                                      data.size() );
1935 
1936 #ifdef MAKE_EXTRA_TIFF_HANDLER
1937       if( img.isNull() )
1938         GdTiff::tiffToQImage( &data.front(), data.size(), img );
1939 #endif
1940 
1941       dataMutex.unlock();
1942 
1943       if ( !img.isNull() )
1944       {
1945         // Managed to load -- now store it back as BMP
1946 
1947         QByteArray ba;
1948         QBuffer buffer( &ba );
1949         buffer.open( QIODevice::WriteOnly );
1950         img.save( &buffer, "BMP" );
1951 
1952         Mutex::Lock _( dataMutex );
1953 
1954         data.resize( buffer.size() );
1955 
1956         memcpy( &data.front(), buffer.data(), data.size() );
1957       }
1958     }
1959 
1960     Mutex::Lock _( dataMutex );
1961 
1962     hasAnyData = true;
1963   }
1964   catch( std::exception &ex )
1965   {
1966     gdWarning( "DSL: Failed loading resource \"%s\" for \"%s\", reason: %s\n",
1967                resourceName.c_str(), dict.getName().c_str(), ex.what() );
1968     // Resource not loaded -- we don't set the hasAnyData flag then
1969   }
1970 
1971   finish();
1972 }
1973 
getResource(string const & name)1974 sptr< Dictionary::DataRequest > DslDictionary::getResource( string const & name )
1975   THROW_SPEC( std::exception )
1976 {
1977   return new DslResourceRequest( *this, name );
1978 }
1979 
// Everything between '#if 0' and '#endif' is excluded from compilation.
// The function would locate the idx/dict/syn companions of an .ifo file by
// trying several spellings of each extension, throwing when a required one is
// missing.
// NOTE(review): the file names suggest this is a leftover from another
// dictionary format's loader and nothing in view uses it -- confirm and
// consider deleting it.
#if 0
static void findCorrespondingFiles( string const & ifo,
                                    string & idx, string & dict, string & syn,
                                    bool needSyn )
{
  string base( ifo, 0, ifo.size() - 3 );

  if ( !(
          tryPossibleName( base + "idx", idx ) ||
          tryPossibleName( base + "idx.gz", idx ) ||
          tryPossibleName( base + "idx.dz", idx ) ||
          tryPossibleName( base + "IDX", idx ) ||
          tryPossibleName( base + "IDX.GZ", idx ) ||
          tryPossibleName( base + "IDX.DZ", idx )
      ) )
    throw exNoIdxFile( ifo );

  if ( !(
          tryPossibleName( base + "dict", dict ) ||
          tryPossibleName( base + "dict.dz", dict ) ||
          tryPossibleName( base + "DICT", dict ) ||
          tryPossibleName( base + "dict.DZ", dict )
      ) )
    throw exNoDictFile( ifo );

  if ( needSyn && !(
                     tryPossibleName( base + "syn", syn ) ||
                     tryPossibleName( base + "syn.gz", syn ) ||
                     tryPossibleName( base + "syn.dz", syn ) ||
                     tryPossibleName( base + "SYN", syn ) ||
                     tryPossibleName( base + "SYN.GZ", syn ) ||
                     tryPossibleName( base + "SYN.DZ", syn )
     ) )
    throw exNoSynFile( ifo );
}
#endif
2016 
getSearchResults(QString const & searchString,int searchMode,bool matchCase,int distanceBetweenWords,int maxResults,bool ignoreWordsOrder,bool ignoreDiacritics)2017 sptr< Dictionary::DataRequest > DslDictionary::getSearchResults( QString const & searchString,
2018                                                                  int searchMode, bool matchCase,
2019                                                                  int distanceBetweenWords,
2020                                                                  int maxResults,
2021                                                                  bool ignoreWordsOrder,
2022                                                                  bool ignoreDiacritics )
2023 {
2024   return new FtsHelpers::FTSResultsRequest( *this, searchString,searchMode, matchCase, distanceBetweenWords, maxResults, ignoreWordsOrder, ignoreDiacritics );
2025 }
2026 
2027 } // anonymous namespace
2028 
2029 /// makeDictionaries
2030 
makeDictionaries(vector<string> const & fileNames,string const & indicesDir,Dictionary::Initializing & initializing,int maxPictureWidth,unsigned int maxHeadwordSize)2031 vector< sptr< Dictionary::Class > > makeDictionaries(
2032                                       vector< string > const & fileNames,
2033                                       string const & indicesDir,
2034                                       Dictionary::Initializing & initializing,
2035                                       int maxPictureWidth, unsigned int maxHeadwordSize )
2036   THROW_SPEC( std::exception )
2037 {
2038   vector< sptr< Dictionary::Class > > dictionaries;
2039 
2040   for( vector< string >::const_iterator i = fileNames.begin(); i != fileNames.end();
2041        ++i )
2042   {
2043     // Try .dsl and .dsl.dz suffixes
2044 
2045     bool uncompressedDsl = ( i->size() >= 4 &&
2046                              strcasecmp( i->c_str() + ( i->size() - 4 ), ".dsl" ) == 0 );
2047     if ( !uncompressedDsl &&
2048          ( i->size() < 7 ||
2049            strcasecmp( i->c_str() + ( i->size() - 7 ), ".dsl.dz" ) != 0 ) )
2050       continue;
2051 
2052     // Make sure it's not an abbreviation file
2053 
2054     int extSize = ( uncompressedDsl ? 4 : 7 );
2055     if ( i->size() - extSize >= 5 &&
2056          strncasecmp( i->c_str() + i->size() - extSize - 5, "_abrv", 5 ) == 0 )
2057     {
2058       // It is, skip it
2059       continue;
2060     }
2061 
2062     unsigned atLine = 0; // Indicates current line in .dsl, for debug purposes
2063 
2064     try
2065     {
2066       vector< string > dictFiles( 1, *i );
2067 
2068       // Check if there is an 'abrv' file present
2069       string baseName = ( (*i)[ i->size() - 4 ] == '.' ) ?
2070                string( *i, 0, i->size() - 4 ) : string( *i, 0, i->size() - 7 );
2071 
2072       string abrvFileName;
2073 
2074       if ( File::tryPossibleName( baseName + "_abrv.dsl", abrvFileName ) ||
2075            File::tryPossibleName( baseName + "_abrv.dsl.dz", abrvFileName ) ||
2076            File::tryPossibleName( baseName + "_ABRV.DSL", abrvFileName ) ||
2077            File::tryPossibleName( baseName + "_ABRV.DSL.DZ", abrvFileName ) ||
2078            File::tryPossibleName( baseName + "_ABRV.DSL.dz", abrvFileName ) )
2079         dictFiles.push_back( abrvFileName );
2080 
2081       string dictId = Dictionary::makeDictionaryId( dictFiles );
2082 
2083       // See if there's a zip file with resources present. If so, include it.
2084 
2085       string zipFileName;
2086 
2087       if ( File::tryPossibleZipName( baseName + ".dsl.files.zip", zipFileName ) ||
2088            File::tryPossibleZipName( baseName + ".dsl.dz.files.zip", zipFileName ) ||
2089            File::tryPossibleZipName( baseName + ".DSL.FILES.ZIP", zipFileName ) ||
2090            File::tryPossibleZipName( baseName + ".DSL.DZ.FILES.ZIP", zipFileName ) )
2091         dictFiles.push_back( zipFileName );
2092 
2093       string indexFile = indicesDir + dictId;
2094 
2095       if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) ||
2096            indexIsOldOrBad( indexFile, zipFileName.size() ) )
2097       {
2098         DslScanner scanner( *i );
2099 
2100         try { // Here we intercept any errors during the read to save line at
2101               // which the incident happened. We need alive scanner for that.
2102 
2103         if ( scanner.getDictionaryName() == GD_NATIVE_TO_WS( L"Abbrev" ) )
2104           continue; // For now just skip abbreviations
2105 
2106         // Building the index
2107         initializing.indexingDictionary( Utf8::encode( scanner.getDictionaryName() ) );
2108 
2109         gdDebug( "Dsl: Building the index for dictionary: %s\n",
2110                  gd::toQString( scanner.getDictionaryName() ).toUtf8().data() );
2111 
2112         File::Class idx( indexFile, "wb" );
2113 
2114         IdxHeader idxHeader;
2115 
2116         memset( &idxHeader, 0, sizeof( idxHeader ) );
2117 
2118         // We write a dummy header first. At the end of the process the header
2119         // will be rewritten with the right values.
2120 
2121         idx.write( idxHeader );
2122 
2123         string dictionaryName = Utf8::encode( scanner.getDictionaryName() );
2124 
2125         idx.write( (uint32_t) dictionaryName.size() );
2126         idx.write( dictionaryName.data(), dictionaryName.size() );
2127 
2128         string soundDictName = Utf8::encode( scanner.getSoundDictionaryName() );
2129         if( !soundDictName.empty() )
2130         {
2131           idxHeader.hasSoundDictionaryName = 1;
2132           idx.write( (uint32_t) soundDictName.size() );
2133           idx.write( soundDictName.data(), soundDictName.size() );
2134         }
2135 
2136         idxHeader.dslEncoding = scanner.getEncoding();
2137 
2138         IndexedWords indexedWords;
2139 
2140         ChunkedStorage::Writer chunks( idx );
2141 
2142         // Read the abbreviations
2143 
2144         if ( abrvFileName.size() )
2145         {
2146           try
2147           {
2148             DslScanner abrvScanner( abrvFileName );
2149 
2150             map< string, string > abrv;
2151 
2152             wstring curString;
2153             size_t curOffset;
2154 
2155             for( ; ; )
2156             {
2157               // Skip any whitespace
2158               if ( !abrvScanner.readNextLineWithoutComments( curString, curOffset ) )
2159                 break;
2160               if ( curString.empty() || isDslWs( curString[ 0 ] ) )
2161                 continue;
2162 
2163               list< wstring > keys;
2164 
2165               bool eof = false;
2166 
2167               // Insert the key and read more, or get to the definition
2168               for( ; ; )
2169               {
2170                 processUnsortedParts( curString, true );
2171 
2172                 if ( keys.size() )
2173                   expandTildes( curString, keys.front() );
2174 
2175                 expandOptionalParts( curString, &keys );
2176 
2177                 if ( !abrvScanner.readNextLineWithoutComments( curString, curOffset ) || curString.empty() )
2178                 {
2179                   gdWarning( "Premature end of file %s\n", abrvFileName.c_str() );
2180                   eof = true;
2181                   break;
2182                 }
2183 
2184                 if ( isDslWs( curString[ 0 ] ) )
2185                   break;
2186               }
2187 
2188               if ( eof )
2189                 break;
2190 
2191               curString.erase( 0, curString.find_first_not_of( GD_NATIVE_TO_WS( L" \t" ) ) );
2192 
2193               if ( keys.size() )
2194                 expandTildes( curString, keys.front() );
2195 
2196               // If the string has any dsl markup, we strip it
2197               string value = Utf8::encode( ArticleDom( curString ).root.renderAsText() );
2198 
2199               for( list< wstring >::iterator i = keys.begin(); i != keys.end();
2200                    ++i )
2201               {
2202                 unescapeDsl( *i );
2203                 normalizeHeadword( *i );
2204 
2205                 abrv[ Utf8::encode( Folding::trimWhitespace( *i ) ) ] = value;
2206               }
2207             }
2208 
2209             idxHeader.hasAbrv = 1;
2210             idxHeader.abrvAddress = chunks.startNewBlock();
2211 
2212             uint32_t sz = abrv.size();
2213 
2214             chunks.addToBlock( &sz, sizeof( uint32_t ) );
2215 
2216             for( map< string, string >::const_iterator i = abrv.begin();
2217                  i != abrv.end(); ++i )
2218             {
2219 //              DPRINTF( "%s:%s\n", i->first.c_str(), i->second.c_str() );
2220 
2221               sz = i->first.size();
2222               chunks.addToBlock( &sz, sizeof( uint32_t ) );
2223               chunks.addToBlock( i->first.data(), sz );
2224               sz = i->second.size();
2225               chunks.addToBlock( &sz, sizeof( uint32_t ) );
2226               chunks.addToBlock( i->second.data(), sz );
2227             }
2228           }
2229           catch( std::exception & e )
2230           {
2231             gdWarning( "Error reading abrv file \"%s\", error: %s. Skipping it.\n",
2232                        abrvFileName.c_str(), e.what() );
2233           }
2234         }
2235 
2236         bool hasString = false;
2237         wstring curString;
2238         size_t curOffset;
2239 
2240         uint32_t articleCount = 0, wordCount = 0;
2241 
2242         for( ; ; )
2243         {
2244           // Find the main headword
2245 
2246           if ( !hasString && !scanner.readNextLineWithoutComments( curString, curOffset ) )
2247             break; // Clean end of file
2248 
2249           hasString = false;
2250 
2251           // The line read should either consist of pure whitespace, or be a
2252           // headword
2253 
2254           if ( curString.empty() )
2255             continue;
2256 
2257           if ( isDslWs( curString[ 0 ] ) )
2258           {
2259             // The first character is blank. Let's make sure that all other
2260             // characters are blank, too.
2261             for( size_t x = 1; x < curString.size(); ++x )
2262             {
2263               if ( !isDslWs( curString[ x ] ) )
2264               {
2265                 gdWarning( "Garbage string in %s at offset 0x%lX\n", i->c_str(), (unsigned long) curOffset );
2266                 break;
2267               }
2268             }
2269             continue;
2270           }
2271 
2272           // Ok, got the headword
2273 
2274           list< wstring > allEntryWords;
2275 
2276           processUnsortedParts( curString, true );
2277           expandOptionalParts( curString, &allEntryWords );
2278 
2279           uint32_t articleOffset = curOffset;
2280 
2281           //DPRINTF( "Headword: %ls\n", curString.c_str() );
2282 
2283           // More headwords may follow
2284 
2285           for( ; ; )
2286           {
2287             if ( ! ( hasString = scanner.readNextLineWithoutComments( curString, curOffset ) ) )
2288             {
2289               gdWarning( "Premature end of file %s\n", i->c_str() );
2290               break;
2291             }
2292 
2293             // Lingvo skips empty strings between the headwords
2294             if ( curString.empty() )
2295               continue;
2296 
2297             if ( isDslWs( curString[ 0 ] ) )
2298               break; // No more headwords
2299 
2300 #ifdef QT_DEBUG
2301             qDebug() << "Alt headword" << gd::toQString( curString );
2302 #endif
2303 
2304             processUnsortedParts( curString, true );
2305             expandTildes( curString, allEntryWords.front() );
2306             expandOptionalParts( curString, &allEntryWords );
2307           }
2308 
2309           if ( !hasString )
2310             break;
2311 
2312           // Insert new entry
2313 
2314           uint32_t descOffset = chunks.startNewBlock();
2315 
2316           chunks.addToBlock( &articleOffset, sizeof( articleOffset ) );
2317 
2318           for( list< wstring >::iterator j = allEntryWords.begin();
2319                j != allEntryWords.end(); ++j )
2320           {
2321             unescapeDsl( *j );
2322             normalizeHeadword( *j );
2323             indexedWords.addWord( *j, descOffset, maxHeadwordSize );
2324           }
2325 
2326           ++articleCount;
2327           wordCount += allEntryWords.size();
2328 
2329           int insideInsided = 0;
2330           wstring headword;
2331           QVector< InsidedCard > insidedCards;
2332           uint32_t offset = curOffset;
2333           QVector< wstring > insidedHeadwords;
2334           unsigned linesInsideCard = 0;
2335           int dogLine = 0;
2336           bool wasEmptyLine = false;
2337           int headwordLine = scanner.getLinesRead() - 2;
2338           bool noSignificantLines = Folding::applyWhitespaceOnly( curString ).empty();
2339           bool haveLine = !noSignificantLines;
2340 
2341           // Skip the article's body
2342           for( ; ; )
2343           {
2344             hasString = haveLine ? true : scanner.readNextLineWithoutComments( curString, curOffset );
2345             haveLine = false;
2346 
2347             if ( !hasString || ( curString.size() && !isDslWs( curString[ 0 ] ) ) )
2348             {
2349               if( insideInsided )
2350               {
2351                 gdWarning( "Unclosed tag '@' at line %i", dogLine );
2352                 insidedCards.append( InsidedCard( offset, curOffset - offset, insidedHeadwords ) );
2353               }
2354               if( noSignificantLines )
2355                 gdWarning( "Orphan headword at line %i", headwordLine );
2356 
2357               break;
2358             }
2359 
2360             // Check for orphan strings
2361 
2362             if( curString.empty() )
2363             {
2364               wasEmptyLine = true;
2365               continue;
2366             }
2367             else
2368             {
2369               if( wasEmptyLine && !Folding::applyWhitespaceOnly( curString ).empty() )
2370                 gdWarning( "Orphan string at line %i", scanner.getLinesRead() - 1 );
2371             }
2372 
2373             if( noSignificantLines )
2374               noSignificantLines = Folding::applyWhitespaceOnly( curString ).empty();
2375 
2376             // Find embedded cards
2377 
2378             wstring::size_type n = curString.find( L'@' );
2379             if( n == wstring::npos || curString[ n - 1 ] == L'\\' )
2380             {
2381               if( insideInsided )
2382                 linesInsideCard++;
2383 
2384               continue;
2385             }
2386             else
2387             {
2388               // Embedded card tag must be placed at first position in line after spaces
2389               if( !isAtSignFirst( curString ) )
2390               {
2391                 gdWarning( "Unescaped '@' symbol at line %i", scanner.getLinesRead() - 1 );
2392 
2393                 if( insideInsided )
2394                   linesInsideCard++;
2395 
2396                 continue;
2397               }
2398             }
2399 
2400             dogLine = scanner.getLinesRead() - 1;
2401 
2402             // Handle embedded card
2403 
2404             if( insideInsided )
2405             {
2406               if( linesInsideCard )
2407               {
2408                 insidedCards.append( InsidedCard( offset, curOffset - offset, insidedHeadwords ) );
2409 
2410                 insidedHeadwords.clear();
2411                 linesInsideCard = 0;
2412                 offset = curOffset;
2413               }
2414             }
2415             else
2416             {
2417               offset = curOffset;
2418               linesInsideCard = 0;
2419             }
2420 
2421             headword = Folding::trimWhitespace( curString.substr( n + 1 ) );
2422 
2423             if( !headword.empty() )
2424             {
2425               processUnsortedParts( headword, true );
2426               expandTildes( headword, allEntryWords.front() );
2427               insidedHeadwords.append( headword );
2428               insideInsided = true;
2429             }
2430             else
2431               insideInsided = false;
2432           }
2433 
2434           // Now that we're having read the first string after the article
2435           // itself, we can use its offset to calculate the article's size.
2436           // An end of file works here, too.
2437 
2438           uint32_t articleSize = ( curOffset - articleOffset );
2439 
2440           chunks.addToBlock( &articleSize, sizeof( articleSize ) );
2441 
2442           for( QVector< InsidedCard >::iterator i = insidedCards.begin(); i != insidedCards.end(); ++i )
2443           {
2444             uint32_t descOffset = chunks.startNewBlock();
2445             chunks.addToBlock( &(*i).offset, sizeof( (*i).offset ) );
2446             chunks.addToBlock( &(*i).size, sizeof( (*i).size ) );
2447 
2448             for( int x = 0; x < (*i).headwords.size(); x++ )
2449             {
2450               allEntryWords.clear();
2451               expandOptionalParts( (*i).headwords[ x ], &allEntryWords );
2452 
2453               for( list< wstring >::iterator j = allEntryWords.begin();
2454                    j != allEntryWords.end(); ++j )
2455               {
2456                 unescapeDsl( *j );
2457                 normalizeHeadword( *j );
2458                 indexedWords.addWord( *j, descOffset, maxHeadwordSize );
2459               }
2460 
2461               wordCount += allEntryWords.size();
2462             }
2463             ++articleCount;
2464           }
2465 
2466           if ( !hasString )
2467             break;
2468         }
2469 
2470         // Finish with the chunks
2471 
2472         idxHeader.chunksOffset = chunks.finish();
2473 
2474         // Build index
2475 
2476         IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );
2477 
2478         idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
2479         idxHeader.indexRootOffset = idxInfo.rootOffset;
2480 
2481         indexedWords.clear(); // Release memory -- no need for this data
2482 
2483         // If there was a zip file, index it too
2484 
2485         if ( zipFileName.size() )
2486         {
2487           GD_DPRINTF( "Indexing zip file\n" );
2488 
2489           idxHeader.hasZipFile = 1;
2490 
2491           IndexedWords zipFileNames;
2492           IndexedZip zipFile;
2493           if( zipFile.openZipFile( QDir::fromNativeSeparators(
2494                                    FsEncoding::decode( zipFileName.c_str() ) ) ) )
2495               zipFile.indexFile( zipFileNames );
2496 
2497           if( !zipFileNames.empty() )
2498           {
2499             // Build the resulting zip file index
2500 
2501             IndexInfo idxInfo = BtreeIndexing::buildIndex( zipFileNames, idx );
2502 
2503             idxHeader.zipIndexBtreeMaxElements = idxInfo.btreeMaxElements;
2504             idxHeader.zipIndexRootOffset = idxInfo.rootOffset;
2505           }
2506           else
2507           {
2508             // Bad zip file -- no index (though the mark that we have one
2509             // remains)
2510             idxHeader.zipIndexBtreeMaxElements = 0;
2511             idxHeader.zipIndexRootOffset = 0;
2512           }
2513         }
2514         else
2515           idxHeader.hasZipFile = 0;
2516 
2517         // That concludes it. Update the header.
2518 
2519         idxHeader.signature = Signature;
2520         idxHeader.formatVersion = CurrentFormatVersion;
2521         idxHeader.zipSupportVersion = CurrentZipSupportVersion;
2522 
2523         idxHeader.articleCount = articleCount;
2524         idxHeader.wordCount = wordCount;
2525 
2526         idxHeader.langFrom = dslLanguageToId( scanner.getLangFrom() );
2527         idxHeader.langTo = dslLanguageToId( scanner.getLangTo() );
2528 
2529         idx.rewind();
2530 
2531         idx.write( &idxHeader, sizeof( idxHeader ) );
2532 
2533       } // In-place try for saving line count
2534       catch( ... )
2535       {
2536         atLine = scanner.getLinesRead();
2537         throw;
2538       }
2539 
2540       } // if need to rebuild
2541 
2542       dictionaries.push_back( new DslDictionary( dictId,
2543                                                  indexFile,
2544                                                  dictFiles,
2545                                                  maxPictureWidth ) );
2546     }
2547     catch( std::exception & e )
2548     {
2549       gdWarning( "DSL dictionary reading failed: %s:%u, error: %s\n",
2550                  i->c_str(), atLine, e.what() );
2551     }
2552   }
2553 
2554   return dictionaries;
2555 }
2556 
2557 
2558 }
2559