1 /* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
2  * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
3 
4 #include "stardict.hh"
5 #include "btreeidx.hh"
6 #include "folding.hh"
7 #include "utf8.hh"
8 #include "chunkedstorage.hh"
9 #include "dictzip.h"
10 #include "xdxf2html.hh"
11 #include "htmlescape.hh"
12 #include "langcoder.hh"
13 #include "gddebug.hh"
14 #include "fsencoding.hh"
15 #include "filetype.hh"
16 #include "indexedzip.hh"
17 #include "tiff.hh"
18 #include "ftshelpers.hh"
19 #include "wstring_qt.hh"
20 #include "audiolink.hh"
21 
22 #include <zlib.h>
23 #include <map>
24 #include <set>
25 #include <string>
26 #ifndef __WIN32
27 #include <arpa/inet.h>
28 #else
29 #include <winsock.h>
30 #endif
31 #include <stdlib.h>
32 
33 #ifdef _MSC_VER
34 #include <stub_msvc.h>
35 #endif
36 
37 #include <QString>
38 #include <QSemaphore>
39 #include <QThreadPool>
40 #include <QAtomicInt>
41 #include <QDebug>
42 #include <QRegExp>
43 #include <QStringList>
44 #include <QDomDocument>
45 #include <QDomNode>
46 #include "ufile.hh"
47 #include "qt4x5.hh"
48 
49 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
50 #include <QRegularExpression>
51 #endif
52 
53 namespace Stardict {
54 
55 using std::map;
56 using std::multimap;
57 using std::pair;
58 using std::set;
59 using std::string;
60 using gd::wstring;
61 
62 using BtreeIndexing::WordArticleLink;
63 using BtreeIndexing::IndexedWords;
64 using BtreeIndexing::IndexInfo;
65 
66 namespace {
67 
68 DEF_EX( exNotAnIfoFile, "Not an .ifo file", Dictionary::Ex )
69 DEF_EX_STR( exBadFieldInIfo, "Bad field in .ifo file encountered:", Dictionary::Ex )
70 DEF_EX_STR( exNoIdxFile, "No corresponding .idx file was found for", Dictionary::Ex )
71 DEF_EX_STR( exNoDictFile, "No corresponding .dict file was found for", Dictionary::Ex )
72 DEF_EX_STR( exNoSynFile, "No corresponding .syn file was found for", Dictionary::Ex )
73 
74 DEF_EX( ex64BitsNotSupported, "64-bit indices are not presently supported, sorry", Dictionary::Ex )
75 DEF_EX( exDicttypeNotSupported, "Dictionaries with dicttypes are not supported, sorry", Dictionary::Ex )
76 
77 DEF_EX_STR( exCantReadFile, "Can't read file", Dictionary::Ex )
78 DEF_EX_STR( exWordIsTooLarge, "Enountered a word that is too large:", Dictionary::Ex )
79 DEF_EX_STR( exSuddenEndOfFile, "Sudden end of file", Dictionary::Ex )
80 DEF_EX_STR( exDictzipError, "DICTZIP error", Dictionary::Ex )
81 
82 DEF_EX_STR( exIncorrectOffset, "Incorrect offset encountered in file", Dictionary::Ex )
83 
84 /// Contents of an ifo file
85 struct Ifo
86 {
87   string version;
88   string bookname;
89   uint32_t wordcount, synwordcount, idxfilesize, idxoffsetbits;
90   string sametypesequence, dicttype, description;
91   string copyright, author, email, website, date;
92 
93   Ifo( File::Class & );
94 };
95 
96 enum
97 {
98   Signature = 0x58444953, // SIDX on little-endian, XDIS on big-endian
99   CurrentFormatVersion = 9 + BtreeIndexing::FormatVersion + Folding::Version
100 };
101 
/// Header stored at the beginning of the index file. It is read back as one
/// raw block of sizeof( IdxHeader ) bytes (see indexIsOldOrBad() and the
/// StardictDictionary constructor), so the member order and sizes define the
/// on-disk layout. All members are uint32_t. The book name and the
/// sametypesequence string follow the header in the file, in that order
/// (the constructor reads them right after the header).
/// NOTE(review): a layout change presumably needs a CurrentFormatVersion
/// bump so that old index files get rejected -- confirm.
struct IdxHeader
{
  uint32_t signature; // First comes the signature, SIDX
  uint32_t formatVersion; // File format version (CurrentFormatVersion)
  uint32_t chunksOffset; // The offset to chunks' storage
  uint32_t indexBtreeMaxElements; // Two fields from IndexInfo
  uint32_t indexRootOffset;
  uint32_t wordCount; // Saved from Ifo::wordcount
  uint32_t synWordCount; // Saved from Ifo::synwordcount
  uint32_t bookNameSize; // Book name's length. Used to read it then.
  uint32_t sameTypeSequenceSize; // That string's size. Used to read it then.
  uint32_t langFrom;  // Source language
  uint32_t langTo;    // Target language
  uint32_t hasZipFile; // Non-zero means there's a zip file with resources present
  uint32_t zipIndexBtreeMaxElements; // Two fields from IndexInfo of the zip
                                     // resource index.
  uint32_t zipIndexRootOffset;
}
// The packed attribute is applied on every compiler except MSVC
#ifndef _MSC_VER
__attribute__((packed))
#endif
;
124 
indexIsOldOrBad(string const & indexFile)125 bool indexIsOldOrBad( string const & indexFile )
126 {
127   File::Class idx( indexFile, "rb" );
128 
129   IdxHeader header;
130 
131   return idx.readRecords( &header, sizeof( header ), 1 ) != 1 ||
132          header.signature != Signature ||
133          header.formatVersion != CurrentFormatVersion;
134 }
135 
/// A dictionary backed by a set of Stardict files and by the index file
/// built for them.
///
/// NOTE: the order of the data members matters. The constructor's
/// initializer list reads the header, the book name and the
/// sametypesequence string from 'idx' one after another, and then hands
/// 'idx' to 'chunks'; members are initialized in declaration order.
class StardictDictionary: public BtreeIndexing::BtreeDictionary
{
  // Locked around chunk reads (see getArticleProps()) and handed to
  // openIndex() together with 'idx'
  Mutex idxMutex;
  // The index file, opened for reading
  File::Class idx;
  // Header read from the start of 'idx'
  IdxHeader idxHeader;
  // The two strings stored in 'idx' right after the header
  string bookName;
  string sameTypeSequence;
  // Chunked storage inside 'idx', located at idxHeader.chunksOffset
  ChunkedStorage::Reader chunks;
  // NOTE(review): presumably guards 'dz'; its users are not in this section
  Mutex dzMutex;
  // Handle returned by dict_data_open(); closed in the destructor
  dictData * dz;
  // NOTE(review): presumably guards 'resourceZip'; users are not in this section
  Mutex resourceZipMutex;
  // Resource zip archive; opened by the constructor only when the header
  // says that one is present
  IndexedZip resourceZip;

public:

  /// Opens the index file 'indexFile' and the files listed in
  /// 'dictionaryFiles'; element [ 2 ] of that list is opened as the .dict
  /// file. Throws exDictzipError if the .dict file cannot be opened.
  StardictDictionary( string const & id, string const & indexFile,
                      vector< string > const & dictionaryFiles );

  ~StardictDictionary();

  /// Returns the book name stored in the index file.
  virtual string getName() throw()
  { return bookName; }

  /// Always returns an empty map.
  virtual map< Dictionary::Property, string > getProperties() throw()
  { return map< Dictionary::Property, string >(); }

  /// Returns the word count saved in the index header.
  virtual unsigned long getArticleCount() throw()
  { return idxHeader.wordCount; }

  /// Returns the sum of the word count and the synonym word count saved in
  /// the index header.
  virtual unsigned long getWordCount() throw()
  { return idxHeader.wordCount + idxHeader.synWordCount; }

  /// Returns the source language saved in the index header.
  inline virtual quint32 getLangFrom() const
  { return idxHeader.langFrom; }

  /// Returns the target language saved in the index header.
  inline virtual quint32 getLangTo() const
  { return idxHeader.langTo; }

  virtual sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( wstring const & )
    THROW_SPEC( std::exception );

  virtual sptr< Dictionary::DataRequest > getArticle( wstring const &,
                                                      vector< wstring > const & alts,
                                                      wstring const &,
                                                      bool ignoreDiacritics )
    THROW_SPEC( std::exception );

  virtual sptr< Dictionary::DataRequest > getResource( string const & name )
    THROW_SPEC( std::exception );

  virtual QString const& getDescription();

  virtual QString getMainFilename();

  virtual sptr< Dictionary::DataRequest > getSearchResults( QString const & searchString,
                                                            int searchMode, bool matchCase,
                                                            int distanceBetweenWords,
                                                            int maxResults,
                                                            bool ignoreWordsOrder,
                                                            bool ignoreDiacritics );
  virtual void getArticleText( uint32_t articleAddress, QString & headword, QString & text );

  virtual void makeFTSIndex(QAtomicInt & isCancelled, bool firstIteration );

  /// Decides whether full-text search may be used for this dictionary: it
  /// must be enabled in 'fts', "STARDICT" must not be listed among the
  /// disabled types, and the article count must not exceed
  /// fts.maxDictionarySize (a value of 0 disables that last check).
  virtual void setFTSParameters( Config::FullTextSearch const & fts )
  {
    can_FTS = fts.enabled
              && !fts.disabledTypes.contains( "STARDICT", Qt::CaseInsensitive )
              && ( fts.maxDictionarySize == 0 || getArticleCount() <= fts.maxDictionarySize );
  }
protected:

  /// Loads the dictionary icon once; falls back to the built-in Stardict
  /// icon when no icon file can be loaded.
  void loadIcon() throw();

private:

  /// Retrieves the article's offset/size in .dict file, and its headword.
  void getArticleProps( uint32_t articleAddress,
                        string & headword,
                        uint32_t & offset, uint32_t & size );

  /// Loads the article, storing its headword and formatting the data it has
  /// into an html.
  void loadArticle(  uint32_t address,
                     string & headword,
                     string & articleText );

  /// Reads 'size' bytes from the current position of 'idx' and returns them
  /// as a string.
  string loadString( size_t size );

  /// Converts one typed data block of an article into html.
  string handleResource( char type, char const * resource, size_t size );

  /// Rewrites Pango markup contained in 'text' in place.
  void pangoToHtml( QString & text );

  // Request classes that need access to the private members above
  friend class StardictResourceRequest;
  friend class StardictArticleRequest;
  friend class StardictHeadwordsRequest;
};
233 
StardictDictionary(string const & id,string const & indexFile,vector<string> const & dictionaryFiles)234 StardictDictionary::StardictDictionary( string const & id,
235                                         string const & indexFile,
236                                         vector< string > const & dictionaryFiles ):
237   BtreeDictionary( id, dictionaryFiles ),
238   idx( indexFile, "rb" ),
239   idxHeader( idx.read< IdxHeader >() ),
240   bookName( loadString( idxHeader.bookNameSize ) ),
241   sameTypeSequence( loadString( idxHeader.sameTypeSequenceSize ) ),
242   chunks( idx, idxHeader.chunksOffset )
243 {
244   // Open the .dict file
245 
246   DZ_ERRORS error;
247   dz = dict_data_open( dictionaryFiles[ 2 ].c_str(), &error, 0 );
248 
249   if ( !dz )
250     throw exDictzipError( string( dz_error_str( error ) )
251                           + "(" + dictionaryFiles[ 2 ] + ")" );
252 
253   // Initialize the index
254 
255   openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
256                         idxHeader.indexRootOffset ),
257              idx, idxMutex );
258 
259   // Open a resource zip file, if there's one
260 
261   if ( idxHeader.hasZipFile &&
262        ( idxHeader.zipIndexBtreeMaxElements ||
263          idxHeader.zipIndexRootOffset ) )
264   {
265     resourceZip.openIndex( IndexInfo( idxHeader.zipIndexBtreeMaxElements,
266                                       idxHeader.zipIndexRootOffset ),
267                            idx, idxMutex );
268 
269     QString zipName = QDir::fromNativeSeparators(
270         FsEncoding::decode( getDictionaryFilenames().back().c_str() ) );
271 
272     if ( zipName.endsWith( ".zip", Qt::CaseInsensitive ) ) // Sanity check
273       resourceZip.openZipFile( zipName );
274   }
275 
276   // Full-text search parameters
277 
278   can_FTS = true;
279 
280   ftsIdxName = indexFile + "_FTS";
281 
282   if( !Dictionary::needToRebuildIndex( dictionaryFiles, ftsIdxName )
283       && !FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this ) )
284     FTS_index_completed.ref();
285 }
286 
~StardictDictionary()287 StardictDictionary::~StardictDictionary()
288 {
289   if ( dz )
290     dict_data_close( dz );
291 }
292 
loadIcon()293 void StardictDictionary::loadIcon() throw()
294 {
295   if ( dictionaryIconLoaded )
296     return;
297 
298   QString fileName =
299     QDir::fromNativeSeparators( FsEncoding::decode( getDictionaryFilenames()[ 0 ].c_str() ) );
300 
301   // Remove the extension
302   fileName.chop( 3 );
303 
304   if( !loadIconFromFile( fileName ) )
305   {
306     // Load failed -- use default icons
307     dictionaryNativeIcon = dictionaryIcon = QIcon(":/icons/icon32_stardict.png");
308   }
309 
310   dictionaryIconLoaded = true;
311 }
312 
loadString(size_t size)313 string StardictDictionary::loadString( size_t size )
314 {
315   if( size == 0 )
316     return string();
317 
318   vector< char > data( size );
319 
320   idx.read( &data.front(), data.size() );
321 
322   return string( &data.front(), data.size() );
323 }
324 
getArticleProps(uint32_t articleAddress,string & headword,uint32_t & offset,uint32_t & size)325 void StardictDictionary::getArticleProps( uint32_t articleAddress,
326                                           string & headword,
327                                           uint32_t & offset, uint32_t & size )
328 {
329   vector< char > chunk;
330 
331   Mutex::Lock _( idxMutex );
332 
333   char * articleData = chunks.getBlock( articleAddress, chunk );
334 
335   memcpy( &offset, articleData, sizeof( uint32_t ) );
336   articleData += sizeof( uint32_t );
337   memcpy( &size, articleData, sizeof( uint32_t ) );
338   articleData += sizeof( uint32_t );
339 
340   headword = articleData;
341 }
342 
343 class PowerWordDataProcessor{
344     class PWSyntaxTranslate{
345     public:
PWSyntaxTranslate(const char * re,const char * replacement)346         PWSyntaxTranslate(const char* re, const char* replacement)
347 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
348             : _re(re, QRegularExpression::UseUnicodePropertiesOption )
349 #else
350             : _re(re)
351 #endif
352             , _replacement(replacement)
353         {
354         }
355 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
re() const356         const QRegularExpression & re() const {
357 #else
358         const QRegExp& re() const {
359 #endif
360             return _re;
361         }
362         const QString & replacement() const {
363             return _replacement;
364         }
365     private:
366 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
367         QRegularExpression _re;
368 #else
369         QRegExp _re;
370 #endif
371         QString _replacement;
372     };
373 public:
PowerWordDataProcessor(const char * resource,size_t size)374     PowerWordDataProcessor(const char* resource, size_t size)
375         : _data(QString::fromUtf8(resource, size))
376     {
377     }
378 
process()379     string process() {
380         QDomDocument doc;
381         QString ss;
382         ss = "<div class=\"sdct_k\">";
383         if (!doc.setContent(_data)) {
384             ss += _data ;
385         } else {
386             QStringList sl;
387             walkNode(doc.firstChild(), sl);
388 
389             QStringListIterator itr(sl);
390             while (itr.hasNext()) {
391                 QString s = itr.next();
392                 translatePW(s);
393                 ss += s;
394                 ss += "<br>";
395             }
396         }
397         ss += "</div>";
398         QByteArray ba = ss.toUtf8();
399         return string(ba.data(), ba.size());
400     }
401 private:
walkNode(const QDomNode & e,QStringList & sl)402     void walkNode(const QDomNode& e, QStringList& sl) {
403         if (e.isNull()) {
404             return;
405         }
406         if (e.isText()) {
407             sl.append(e.toText().data());
408         } else {
409             QDomNodeList l = e.childNodes();
410             for (int i = 0; i < l.size(); ++i) {
411                 QDomNode n = l.at(i);
412                 if (n.isText()) {
413                     sl.append(n.toText().data());
414                 } else {
415                     walkNode(n, sl);
416                 }
417             }
418         }
419     }
420 
translatePW(QString & s)421     void translatePW(QString& s){
422         const int TRANSLATE_TBL_SIZE=5;
423         static PWSyntaxTranslate t[TRANSLATE_TBL_SIZE]={
424             PWSyntaxTranslate("&[bB]\\s*\\{([^\\{}&]+)\\}", "<B>\\1</B>"),
425             PWSyntaxTranslate("&[iI]\\s*\\{([^\\{}&]+)\\}", "<I>\\1</I>"),
426             PWSyntaxTranslate("&[uU]\\s*\\{([^\\{}&]+)\\}", "<U>\\1</U>"),
427             PWSyntaxTranslate("&[lL]\\s*\\{([^\\{}&]+)\\}", "<SPAN style=\"color:#0000ff\">\\1</SPAN>"),
428             PWSyntaxTranslate("&[2]\\s*\\{([^\\{}&]+)\\}", "<SPAN style=\"color:#0000ff\">\\1</SPAN>")
429         };
430 
431         QString old;
432         while (s.compare(old) != 0) {
433             for (int i = 0; i < TRANSLATE_TBL_SIZE; ++i) {
434                 PWSyntaxTranslate& a = t[i];
435                 s.replace(a.re(), a.replacement());
436             }
437             old = s;
438         }
439 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
440         s.replace(QRegularExpression( "&.\\s*\\{",
441                                       QRegularExpression::UseUnicodePropertiesOption
442                                       | QRegularExpression::DotMatchesEverythingOption),
443                   "");
444 #else
445         s.replace(QRegExp("&.\\s*\\{"), "");
446 #endif
447         s.replace("}", "");
448     }
449 private:
450     QString _data;
451 };
452 
453 
454 /// This function tries to make an html of the Stardict's resource typed
455 /// 'type', contained in a block pointed to by 'resource', 'size' bytes long.
handleResource(char type,char const * resource,size_t size)456 string StardictDictionary::handleResource( char type, char const * resource, size_t size )
457 {
458   QString text;
459   switch( type )
460   {
461     case 'x': // Xdxf content
462       return Xdxf2Html::convert( string( resource, size ), Xdxf2Html::STARDICT, NULL, this, &resourceZip );
463     case 'h': // Html content
464     {
465       QString articleText = QString( "<div class=\"sdct_h\">" ) + QString::fromUtf8( resource, size ) + "</div>";
466 
467 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
468       QRegularExpression imgRe( "(<\\s*img\\s+[^>]*src\\s*=\\s*[\"']+)(?!(?:data|https?|ftp):)",
469                                 QRegularExpression::CaseInsensitiveOption
470                                 | QRegularExpression::InvertedGreedinessOption );
471       QRegularExpression linkRe( "(<\\s*link\\s+[^>]*href\\s*=\\s*[\"']+)(?!(?:data|https?|ftp):)",
472                                  QRegularExpression::CaseInsensitiveOption
473                                  | QRegularExpression::InvertedGreedinessOption );
474 #else
475       QRegExp imgRe( "(<\\s*img\\s+[^>]*src\\s*=\\s*[\"']+)(?!(?:data|https?|ftp):)", Qt::CaseInsensitive );
476       imgRe.setMinimal( true );
477       QRegExp linkRe( "(<\\s*link\\s+[^>]*href\\s*=\\s*[\"']+)(?!(?:data|https?|ftp):)", Qt::CaseInsensitive );
478       linkRe.setMinimal( true );
479 #endif
480 
481       articleText.replace( imgRe , "\\1bres://" + QString::fromStdString( getId() ) + "/" )
482                  .replace( linkRe, "\\1bres://" + QString::fromStdString( getId() ) + "/" );
483 
484       // Handle links to articles
485 
486 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
487       QRegularExpression linksReg( "<a(\\s*[^>]*)href\\s*=\\s*['\"](bword://)?([^'\"]+)['\"]",
488                                    QRegularExpression::CaseInsensitiveOption );
489 #else
490       QRegExp linksReg( "<a(\\s*[^>]*)href\\s*=\\s*['\"](bword://)?([^'\"]+)['\"]", Qt::CaseInsensitive );
491       linksReg.setMinimal( true );
492 #endif
493 
494       int pos = 0;
495 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
496       QString articleNewText;
497       QRegularExpressionMatchIterator it = linksReg.globalMatch( articleText );
498       while( it.hasNext() )
499       {
500         QRegularExpressionMatch match = it.next();
501         articleNewText += articleText.midRef( pos, match.capturedStart() - pos );
502         pos = match.capturedEnd();
503 
504         QString link = match.captured( 3 );
505 #else
506       while( pos >= 0 )
507       {
508         pos = linksReg.indexIn( articleText, pos );
509         if( pos < 0 )
510           break;
511 
512         QString link = linksReg.cap( 3 );
513 #endif
514         if( link.indexOf( ':' ) < 0 )
515         {
516           QString newLink;
517           if( link.indexOf( '#' ) < 0 )
518 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
519             newLink = QString( "<a" ) + match.captured( 1 ) + "href=\"bword:" + link + "\"";
520 #else
521             newLink = QString( "<a" ) + linksReg.cap( 1 ) + "href=\"bword:" + link + "\"";
522 #endif
523 
524           // Anchors
525 
526           if( link.indexOf( '#' ) > 0 )
527           {
528 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
529             newLink = QString( "<a" ) + match.captured( 1 ) + "href=\"gdlookup://localhost/" + link + "\"";
530 #else
531             newLink = QString( "<a" ) + linksReg.cap( 1 ) + "href=\"gdlookup://localhost/" + link + "\"";
532 #endif
533             newLink.replace( "#", "?gdanchor=" );
534           }
535 
536           if( !newLink.isEmpty() )
537           {
538 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
539             articleNewText += newLink;
540 #else
541             articleText.replace( pos, linksReg.cap( 0 ).size(), newLink );
542             pos += newLink.size();
543 #endif
544           }
545           else
546 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
547             articleNewText += match.captured();
548 #else
549             pos += linksReg.cap( 0 ).size();
550 #endif
551         }
552         else
553 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
554           articleNewText += match.captured();
555       }
556       if( pos )
557       {
558         articleNewText += articleText.midRef( pos );
559         articleText = articleNewText;
560         articleNewText.clear();
561       }
562 #else
563           pos += linksReg.cap( 0 ).size();
564       }
565 #endif
566 
567       // Handle "audio" tags
568 
569 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
570       QRegularExpression audioRe( "<\\s*audio\\s*src\\s*=\\s*([\"']+)([^\"']+)([\"'])\\s*>(.*)</audio>",
571                                   QRegularExpression::CaseInsensitiveOption
572                                   | QRegularExpression::DotMatchesEverythingOption
573                                   | QRegularExpression::InvertedGreedinessOption );
574 #else
575       QRegExp audioRe( "<\\s*audio\\s*src\\s*=\\s*([\"']+)([^\"']+)([\"'])\\s*>(.*)</audio>", Qt::CaseInsensitive );
576       audioRe.setMinimal( true );
577 #endif
578 
579       pos = 0;
580 
581 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
582       it = audioRe.globalMatch( articleText );
583       while( it.hasNext() )
584       {
585         QRegularExpressionMatch match = it.next();
586         articleNewText += articleText.midRef( pos, match.capturedStart() - pos );
587         pos = match.capturedEnd();
588 
589         QString src = match.captured( 2 );
590 #else
591       while( pos >= 0 )
592       {
593         pos = audioRe.indexIn( articleText, pos );
594         if( pos < 0 )
595           break;
596 
597         QString src = audioRe.cap( 2 );
598 #endif
599         if( src.indexOf( "://" ) >= 0 )
600 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
601           articleNewText += match.captured();
602 #else
603           pos += audioRe.cap( 0 ).length();
604 #endif
605         else
606         {
607           std::string href = "\"gdau://" + getId() + "/" + src.toUtf8().data() + "\"";
608           QString newTag = QString::fromUtf8( ( addAudioLink( href, getId() ) + "<span class=\"sdict_h_wav\"><a href=" + href + ">" ).c_str() );
609 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
610           newTag += match.captured( 4 );
611           if( match.captured( 4 ).indexOf( "<img " ) < 0 )
612 #else
613           newTag += audioRe.cap( 4 );
614           if( audioRe.cap( 4 ).indexOf( "<img " ) < 0 )
615 #endif
616             newTag += " <img src=\"qrcx://localhost/icons/playsound.png\" border=\"0\" alt=\"Play\">";
617           newTag += "</a></span>";
618 
619 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
620           articleNewText += newTag;
621 #else
622           articleText.replace( pos, audioRe.cap( 0 ).length(), newTag );
623           pos += newTag.length();
624 #endif
625         }
626       }
627 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
628       if( pos )
629       {
630         articleNewText += articleText.midRef( pos );
631         articleText = articleNewText;
632         articleNewText.clear();
633       }
634 #endif
635 
636       return ( articleText.toUtf8().data() );
637     }
638     case 'm': // Pure meaning, usually means preformatted text
639       return "<div class=\"sdct_m\">" + Html::preformat( string( resource, size ), isToLanguageRTL() ) + "</div>";
640     case 'l': // Same as 'm', but not in utf8, instead in current locale's
641               // encoding.
642               // We just use Qt here, it should know better about system's
643               // locale.
644       return "<div class=\"sdct_l\">" + Html::preformat( QString::fromLocal8Bit( resource, size ).toUtf8().data(),
645                                                          isToLanguageRTL() )
646                                       + "</div>";
647     case 'g': // Pango markup.
648       text = QString::fromUtf8( resource, size );
649       pangoToHtml( text );
650       return "<div class=\"sdct_g\">" + string( text.toUtf8().data() ) + "</div>";
651     case 't': // Transcription
652       return "<div class=\"sdct_t\">" + Html::escape( string( resource, size ) ) + "</div>";
653     case 'y': // Chinese YinBiao or Japanese KANA. Examples are needed. For now,
654               // just output as pure escaped utf8.
655       return "<div class=\"sdct_y\">" + Html::escape( string( resource, size ) ) + "</div>";
656     case 'k': // KingSoft PowerWord data.
657     {
658       PowerWordDataProcessor pwdp(resource, size);
659       return pwdp.process();
660     }
661     case 'w': // MediaWiki markup. We don't handle this right now.
662       return "<div class=\"sdct_w\">" + Html::escape( string( resource, size ) ) + "</div>";
663     case 'n': // WordNet data. We don't know anything about it.
664       return "<div class=\"sdct_n\">" + Html::escape( string( resource, size ) ) + "</div>";
665 
666     case 'r': // Resource file list. For now, resources aren't handled.
667       return "<div class=\"sdct_r\">" + Html::escape( string( resource, size ) ) + "</div>";
668 
669     case 'W': // An embedded Wav file. Unhandled yet.
670       return "<div class=\"sdct_W\">(an embedded .wav file)</div>";
671     case 'P': // An embedded picture file. Unhandled yet.
672       return "<div class=\"sdct_P\">(an embedded picture file)</div>";
673   }
674 
675   if ( islower( type ) )
676   {
677     return string( "<b>Unknown textual entry type " ) + string( 1, type ) + ":</b> " + Html::escape( string( resource, size ) ) + "<br>";
678   }
679   else
680     return string( "<b>Unknown blob entry type " ) + string( 1, type ) + "</b><br>";
681 }
682 
683 void StardictDictionary::pangoToHtml( QString & text )
684 {
685 /*
686  * Partially support for Pango Markup Language
687  * Attributes "fallback", "lang", "gravity", "gravity_hint" just ignored
688  */
689 
690   QRegExp spanRegex( "<span\\s*([^>]*)>", Qt::CaseInsensitive );
691   QRegExp styleRegex( "(\\w+)=\"([^\"]*)\"" );
692 
693   text.replace( "\n", "<br>" );
694 
695   int pos = 0;
696   do
697   {
698     pos = spanRegex.indexIn( text, pos );
699     if( pos >= 0 )
700     {
701       QString styles = spanRegex.cap( 1 );
702       QString newSpan( "<span style=\"" );
703       int stylePos = 0;
704       do
705       {
706         stylePos = styleRegex.indexIn( styles, stylePos );
707         QString style = styleRegex.cap( 1 );
708         if( stylePos >= 0 )
709         {
710           if( style.compare( "font_desc", Qt::CaseInsensitive ) == 0
711               || style.compare( "font", Qt::CaseInsensitive ) == 0 )
712           {
713             // Parse font description
714 
715             QStringList list = styleRegex.cap( 2 ).split( " ", QString::SkipEmptyParts );
716             int n;
717             QString sizeStr, stylesStr, familiesStr;
718             for( n = list.size() - 1; n >= 0; n-- )
719             {
720               QString str = list.at( n );
721 
722               // font size
723               if( str[ 0 ].isNumber() )
724               {
725                 sizeStr = QString( "font-size:" ) + str + ";";
726                 continue;
727               }
728 
729               // font style
730               if( str.compare( "normal", Qt::CaseInsensitive ) == 0
731                   || str.compare( "oblique", Qt::CaseInsensitive ) == 0
732                   || str.compare( "italic", Qt::CaseInsensitive ) == 0 )
733               {
734                 if( !stylesStr.contains( "font-style:" ) )
735                   stylesStr += QString( "font-style:" ) + str + ";";
736                 continue;
737               }
738 
739               // font variant
740               if( str.compare( "smallcaps", Qt::CaseInsensitive ) == 0 )
741               {
742                 stylesStr += QString( "font-variant:small-caps" ) ;
743                 continue;
744               }
745 
746               // font weight
747               if( str.compare( "ultralight", Qt::CaseInsensitive ) == 0 )
748               {
749                 stylesStr += QString( "font-weight:100;" );
750                 continue;
751               }
752               if( str.compare( "light", Qt::CaseInsensitive ) == 0 )
753               {
754                 stylesStr += QString( "font-weight:200;" );
755                 continue;
756               }
757               if( str.compare( "bold", Qt::CaseInsensitive ) == 0 )
758               {
759                 stylesStr += QString( "font-weight:bold;" );
760                 continue;
761               }
762               if( str.compare( "ultrabold", Qt::CaseInsensitive ) == 0 )
763               {
764                 stylesStr += QString( "font-weight:800;" );
765                 continue;
766               }
767               if( str.compare( "heavy", Qt::CaseInsensitive ) == 0 )
768               {
769                 stylesStr += QString( "font-weight:900" );
770                 continue;
771               }
772 
773               // font stretch
774               if( str.compare( "ultracondensed", Qt::CaseInsensitive ) == 0 )
775               {
776                 stylesStr += QString( "font-stretch:ultra-condensed;" );
777                 continue;
778               }
779               if( str.compare( "extracondensed", Qt::CaseInsensitive ) == 0 )
780               {
781                 stylesStr += QString( "font-stretch:extra-condensed;" );
782                 continue;
783               }
784               if( str.compare( "semicondensed", Qt::CaseInsensitive ) == 0 )
785               {
786                 stylesStr += QString( "font-stretch:semi-condensed;" );
787                 continue;
788               }
789               if( str.compare( "semiexpanded", Qt::CaseInsensitive ) == 0 )
790               {
791                 stylesStr += QString( "font-stretch:semi-expanded;" );
792                 continue;
793               }
794               if( str.compare( "extraexpanded", Qt::CaseInsensitive ) == 0 )
795               {
796                 stylesStr += QString( "font-stretch:extra-expanded;" );
797                 continue;
798               }
799               if( str.compare( "ultraexpanded", Qt::CaseInsensitive ) == 0 )
800               {
801                 stylesStr += QString( "font-stretch:ultra-expanded;" );
802                 continue;
803               }
804               if( str.compare( "condensed", Qt::CaseInsensitive ) == 0
805                   || str.compare( "expanded", Qt::CaseInsensitive ) == 0 )
806               {
807                 stylesStr += QString( "font-stretch:" ) + str + ";";
808                 continue;
809               }
810 
811               // gravity
812               if( str.compare( "south", Qt::CaseInsensitive ) == 0
813                   || str.compare( "east", Qt::CaseInsensitive ) == 0
814                   || str.compare( "north", Qt::CaseInsensitive ) == 0
815                   || str.compare( "west", Qt::CaseInsensitive ) == 0
816                   || str.compare( "auto", Qt::CaseInsensitive ) == 0 )
817               {
818                 continue;
819               }
820               break;
821             }
822 
823             // last words is families list
824             if( n >= 0 )
825             {
826               familiesStr = QString( "font-family:" );
827               for( int i = 0; i <= n; i++ )
828               {
829                 if( i > 0 && !familiesStr.endsWith( ',' ) )
830                   familiesStr += ",";
831                 familiesStr += list.at( i );
832               }
833               familiesStr += ";";
834             }
835 
836             newSpan += familiesStr + stylesStr + sizeStr;
837           }
838           else if( style.compare( "font_family", Qt::CaseInsensitive ) == 0
839                    || style.compare( "face", Qt::CaseInsensitive ) == 0 )
840             newSpan += QString( "font-family:" ) + styleRegex.cap( 2 ) + ";";
841           else if( style.compare( "font_size", Qt::CaseInsensitive ) == 0
842                    || style.compare( "size", Qt::CaseInsensitive ) == 0 )
843           {
844             if( styleRegex.cap( 2 )[ 0 ].isLetter()
845                 || styleRegex.cap( 2 ).endsWith( "px", Qt::CaseInsensitive )
846                 || styleRegex.cap( 2 ).endsWith( "pt", Qt::CaseInsensitive )
847                 || styleRegex.cap( 2 ).endsWith( "em", Qt::CaseInsensitive )
848                 || styleRegex.cap( 2 ).endsWith( "%" ) )
849               newSpan += QString( "font-size:" ) + styleRegex.cap( 2 ) +";";
850             else
851             {
852               int size = styleRegex.cap( 2 ).toInt();
853               if( size )
854                 newSpan += QString( "font-size:%1pt;" ).arg( size / 1024.0, 0, 'f', 3 );
855             }
856           }
857           else if( style.compare( "font_style", Qt::CaseInsensitive ) == 0
858                    || style.compare( "style", Qt::CaseInsensitive ) == 0)
859             newSpan += QString( "font-style:" ) + styleRegex.cap( 2 ) + ";";
860           else if( style.compare( "weight", Qt::CaseInsensitive ) == 0
861                    || style.compare( "weight", Qt::CaseInsensitive ) == 0)
862           {
863             QString str = styleRegex.cap( 2 );
864             if( str.compare( "ultralight", Qt::CaseInsensitive ) == 0 )
865               newSpan += QString( "font-weight:100;" );
866             else if( str.compare( "light", Qt::CaseInsensitive ) == 0 )
867               newSpan += QString( "font-weight:200;" );
868             else if( str.compare( "ultrabold", Qt::CaseInsensitive ) == 0 )
869               newSpan += QString( "font-weight:800;" );
870             else if( str.compare( "heavy", Qt::CaseInsensitive ) == 0 )
871               newSpan += QString( "font-weight:900" );
872             else
873               newSpan += QString( "font-weight:" ) + str + ";";
874           }
875           else if( style.compare( "font_variant", Qt::CaseInsensitive ) == 0
876                    || style.compare( "variant", Qt::CaseInsensitive ) == 0 )
877           {
878             if( styleRegex.cap( 2 ).compare( "smallcaps", Qt::CaseInsensitive ) == 0 )
879               newSpan += QString( "font-variant:small-caps" );
880             else
881               newSpan += QString( "font-variant:" ) + styleRegex.cap( 2 ) + ";";
882           }
883           else if( style.compare( "font_stretch", Qt::CaseInsensitive ) == 0
884                    || style.compare( "stretch", Qt::CaseInsensitive ) == 0 )
885           {
886             QString str = styleRegex.cap( 2 );
887             if( str.compare( "ultracondensed", Qt::CaseInsensitive ) == 0 )
888               newSpan += QString( "font-stretch:ultra-condensed;" );
889             else if( str.compare( "extracondensed", Qt::CaseInsensitive ) == 0 )
890               newSpan += QString( "font-stretch:extra-condensed;" );
891             else if( str.compare( "semicondensed", Qt::CaseInsensitive ) == 0 )
892               newSpan += QString( "font-stretch:semi-condensed;" );
893             else if( str.compare( "semiexpanded", Qt::CaseInsensitive ) == 0 )
894               newSpan += QString( "font-stretch:semi-expanded;" );
895             else if( str.compare( "extraexpanded", Qt::CaseInsensitive ) == 0 )
896               newSpan += QString( "font-stretch:extra-expanded;" );
897             else if( str.compare( "ultraexpanded", Qt::CaseInsensitive ) == 0 )
898               newSpan += QString( "font-stretch:ultra-expanded;" );
899             else
900               newSpan += QString( "font-stretch:" ) + str + ";";
901           }
902           else if( style.compare( "foreground", Qt::CaseInsensitive ) == 0
903                    || style.compare( "fgcolor", Qt::CaseInsensitive ) == 0
904                    || style.compare( "color", Qt::CaseInsensitive ) == 0 )
905             newSpan += QString( "color:" ) + styleRegex.cap( 2 ) + ";";
906           else if( style.compare( "background", Qt::CaseInsensitive ) == 0
907                    || style.compare( "bgcolor", Qt::CaseInsensitive ) == 0 )
908             newSpan += QString( "background-color:" ) + styleRegex.cap( 2 ) + ";";
909           else if( style.compare( "underline_color", Qt::CaseInsensitive ) == 0
910                    || style.compare( "strikethrough_color", Qt::CaseInsensitive ) == 0 )
911             newSpan += QString( "text-decoration-color:" ) + styleRegex.cap( 2 ) + ";";
912           else if( style.compare( "underline", Qt::CaseInsensitive ) == 0 )
913           {
914             if( styleRegex.cap( 2 ).compare( "none", Qt::CaseInsensitive ) )
915               newSpan += QString( "text-decoration-line:none;" );
916             else
917             {
918               newSpan += QString( "text-decoration-line:underline; " );
919               if( styleRegex.cap( 2 ).compare( "low", Qt::CaseInsensitive ) )
920                 newSpan += QString( "text-decoration-style:dotted;" );
921               else if( styleRegex.cap( 2 ).compare( "single", Qt::CaseInsensitive ) )
922                 newSpan += QString( "text-decoration-style:solid;" );
923               else if( styleRegex.cap( 2 ).compare( "error", Qt::CaseInsensitive ) )
924                 newSpan += QString( "text-decoration-style:wavy;" );
925               else
926                 newSpan += QString( "text-decoration-style:" ) + styleRegex.cap( 2 ) + ";";
927             }
928           }
929           else if( style.compare( "strikethrough", Qt::CaseInsensitive ) == 0 )
930           {
931             if( styleRegex.cap( 2 ).compare( "true", Qt::CaseInsensitive ) )
932               newSpan += QString( "text-decoration-line:line-through;" );
933             else
934               newSpan += QString( "text-decoration-line:none;" );
935           }
936           else if( style.compare( "rise", Qt::CaseInsensitive ) == 0 )
937           {
938             if( styleRegex.cap( 2 ).endsWith( "px", Qt::CaseInsensitive )
939                 || styleRegex.cap( 2 ).endsWith( "pt", Qt::CaseInsensitive )
940                 || styleRegex.cap( 2 ).endsWith( "em", Qt::CaseInsensitive )
941                 || styleRegex.cap( 2 ).endsWith( "%" ) )
942               newSpan += QString( "vertical-align:" ) + styleRegex.cap( 2 ) +";";
943             else
944             {
945               int riseValue = styleRegex.cap( 2 ).toInt();
946               if( riseValue )
947                 newSpan += QString( "vertical-align:%1pt;" ).arg( riseValue / 1024.0, 0, 'f', 3 );
948             }
949           }
950           else if( style.compare( "letter_spacing", Qt::CaseInsensitive ) == 0 )
951           {
952             if( styleRegex.cap( 2 ).endsWith( "px", Qt::CaseInsensitive )
953                 || styleRegex.cap( 2 ).endsWith( "pt", Qt::CaseInsensitive )
954                 || styleRegex.cap( 2 ).endsWith( "em", Qt::CaseInsensitive )
955                 || styleRegex.cap( 2 ).endsWith( "%" ) )
956               newSpan += QString( "letter-spacing:" ) + styleRegex.cap( 2 ) +";";
957             else
958             {
959               int spacing = styleRegex.cap( 2 ).toInt();
960               if( spacing )
961                 newSpan += QString( "letter-spacing:%1pt;" ).arg( spacing / 1024.0, 0, 'f', 3 );
962             }
963           }
964 
965           stylePos += styleRegex.matchedLength();
966         }
967       }
968       while( stylePos >= 0 );
969 
970       newSpan += "\">";
971       text.replace( pos, spanRegex.matchedLength(), newSpan );
972       pos += newSpan.size();
973     }
974   }
975   while( pos >= 0 );
976 
977   text.replace( "  ", "&nbsp;&nbsp;" );
978 }
979 
980 void StardictDictionary::loadArticle( uint32_t address,
981                                       string & headword,
982                                       string & articleText )
983 {
984   uint32_t offset, size;
985 
986   getArticleProps( address, headword, offset, size );
987 
988   char * articleBody;
989 
990   {
991     Mutex::Lock _( dzMutex );
992 
993     // Note that the function always zero-pads the result.
994     articleBody = dict_data_read_( dz, offset, size, 0, 0 );
995   }
996 
997   if ( !articleBody )
998   {
999 //    throw exCantReadFile( getDictionaryFilenames()[ 2 ] );
1000     articleText = string( "<div class=\"sdict_m\">DICTZIP error: " ) + dict_error_str( dz ) + "</div>";
1001     return;
1002   }
1003 
1004   articleText.clear();
1005 
1006   char * ptr = articleBody;
1007 
1008   if ( sameTypeSequence.size() )
1009   {
1010     /// The sequence is known, it's not stored in the article itself
1011     for( unsigned seq = 0; seq < sameTypeSequence.size(); ++seq )
1012     {
1013       // Last entry doesn't have size info -- it is inferred from
1014       // the bytes left
1015       bool entrySizeKnown = ( seq == sameTypeSequence.size() - 1 );
1016 
1017       uint32_t entrySize = 0;
1018 
1019       if ( entrySizeKnown )
1020         entrySize = size;
1021       else
1022       if ( !size )
1023       {
1024         gdWarning( "Stardict: short entry for the word %s encountered in \"%s\".\n", headword.c_str(), getName().c_str() );
1025         break;
1026       }
1027 
1028       char type = sameTypeSequence[ seq ];
1029 
1030       if ( islower( type ) )
1031       {
1032         // Zero-terminated entry, unless it's the last one
1033         if ( !entrySizeKnown )
1034           entrySize = strlen( ptr );
1035 
1036         if ( size < entrySize )
1037         {
1038           gdWarning( "Stardict: malformed entry for the word %s encountered in \"%s\".\n", headword.c_str(), getName().c_str() );
1039           break;
1040         }
1041 
1042         articleText += handleResource( type, ptr, entrySize );
1043 
1044         if ( !entrySizeKnown )
1045           ++entrySize; // Need to skip the zero byte
1046 
1047         ptr += entrySize;
1048         size -= entrySize;
1049       }
1050       else
1051       if ( isupper( *ptr ) )
1052       {
1053         // An entry which has its size before contents, unless it's the last one
1054 
1055         if ( !entrySizeKnown )
1056         {
1057           if ( size < sizeof( uint32_t ) )
1058           {
1059             gdWarning( "Stardict: malformed entry for the word %s encountered in \"%s\".\n", headword.c_str(), getName().c_str() );
1060             break;
1061           }
1062 
1063           memcpy( &entrySize, ptr, sizeof( uint32_t ) );
1064 
1065           entrySize = ntohl( entrySize );
1066 
1067           ptr += sizeof( uint32_t );
1068           size -= sizeof( uint32_t );
1069         }
1070 
1071         if ( size < entrySize )
1072         {
1073           gdWarning( "Stardict: malformed entry for the word %s encountered in \"%s\".\n", headword.c_str(), getName().c_str() );
1074           break;
1075         }
1076 
1077         articleText += handleResource( type, ptr, entrySize );
1078 
1079         ptr += entrySize;
1080         size -= entrySize;
1081       }
1082       else
1083       {
1084         gdWarning( "Stardict: non-alpha entry type 0x%x for the word %s encountered in \"%s\".\n",
1085                    type, headword.c_str(), getName().c_str() );
1086         break;
1087       }
1088     }
1089   }
1090   else
1091   {
1092     // The sequence is stored in each article separately
1093     while( size )
1094     {
1095       if ( islower( *ptr ) )
1096       {
1097         // Zero-terminated entry
1098         size_t len = strlen( ptr + 1 );
1099 
1100         if ( size < len + 2 )
1101         {
1102           gdWarning( "Stardict: malformed entry for the word %s encountered in \"%s\".\n", headword.c_str(), getName().c_str() );
1103           break;
1104         }
1105 
1106         articleText += handleResource( *ptr, ptr + 1, len );
1107 
1108         ptr += len + 2;
1109         size -= len + 2;
1110       }
1111       else
1112       if ( isupper( *ptr ) )
1113       {
1114         // An entry which havs its size before contents
1115         if ( size < sizeof( uint32_t ) + 1 )
1116         {
1117           gdWarning( "Stardict: malformed entry for the word %s encountered in \"%s\".\n", headword.c_str(), getName().c_str() );
1118           break;
1119         }
1120 
1121         uint32_t entrySize;
1122 
1123         memcpy( &entrySize, ptr + 1, sizeof( uint32_t ) );
1124 
1125         entrySize = ntohl( entrySize );
1126 
1127         if ( size < sizeof( uint32_t ) + 1 + entrySize )
1128         {
1129           gdWarning( "Stardict: malformed entry for the word %s encountered in \"%s\".\n", headword.c_str(), getName().c_str() );
1130           break;
1131         }
1132 
1133         articleText += handleResource( *ptr, ptr + 1 + sizeof( uint32_t ), entrySize );
1134 
1135         ptr += sizeof( uint32_t ) + 1 + entrySize;
1136         size -= sizeof( uint32_t ) + 1 + entrySize;
1137       }
1138       else
1139       {
1140         gdWarning( "Stardict: non-alpha entry type 0x%x for the word %s encountered in \"%s\".\n",
1141                    (unsigned)*ptr, headword.c_str(), getName().c_str() );
1142         break;
1143       }
1144     }
1145   }
1146 
1147   free( articleBody );
1148 }
1149 
1150 QString const& StardictDictionary::getDescription()
1151 {
1152     if( !dictionaryDescription.isEmpty() )
1153         return dictionaryDescription;
1154 
1155     File::Class ifoFile( getDictionaryFilenames()[ 0 ], "r" );
1156     Ifo ifo( ifoFile );
1157 
1158     if( !ifo.copyright.empty() )
1159     {
1160       QString copyright = QString::fromUtf8( ifo.copyright.c_str() )
1161                           .replace( "<br>", "\n", Qt::CaseInsensitive );
1162       dictionaryDescription += QString( QObject::tr( "Copyright: %1%2" ) )
1163                               .arg( copyright )
1164                               .arg( "\n\n" );
1165     }
1166 
1167     if( !ifo.author.empty() )
1168     {
1169       QString author = QString::fromUtf8( ifo.author.c_str() );
1170       dictionaryDescription += QString( QObject::tr( "Author: %1%2" ) )
1171                               .arg( author )
1172                               .arg( "\n\n" );
1173     }
1174 
1175     if( !ifo.email.empty() )
1176     {
1177       QString email = QString::fromUtf8( ifo.email.c_str() );
1178       dictionaryDescription += QString( QObject::tr( "E-mail: %1%2" ) )
1179                                .arg( email )
1180                                .arg( "\n\n" );
1181     }
1182 
1183     if( !ifo.website.empty() )
1184     {
1185       QString website = QString::fromUtf8( ifo.website.c_str() );
1186       dictionaryDescription += QString( QObject::tr( "Website: %1%2" ) )
1187                                .arg( website )
1188                                .arg( "\n\n" );
1189     }
1190 
1191     if( !ifo.date.empty() )
1192     {
1193       QString date = QString::fromUtf8( ifo.date.c_str() );
1194       dictionaryDescription += QString( QObject::tr( "Date: %1%2" ) )
1195                                .arg( date )
1196                                .arg( "\n\n" );
1197     }
1198 
1199     if( !ifo.description.empty() )
1200     {
1201       QString desc = QString::fromUtf8( ifo.description.c_str() );
1202       desc.replace( "\t", "<br/>" );
1203       desc.replace( "\\n", "<br/>" );
1204       desc.replace( "<br>", "<br/>", Qt::CaseInsensitive );
1205       dictionaryDescription += Html::unescape( desc, true );
1206     }
1207 
1208     if( dictionaryDescription.isEmpty() )
1209       dictionaryDescription = "NONE";
1210 
1211     return dictionaryDescription;
1212 }
1213 
/// Returns the first entry of getDictionaryFilenames(), converted to a
/// QString by FsEncoding::decode().
QString StardictDictionary::getMainFilename()
{
  return FsEncoding::decode( getDictionaryFilenames()[ 0 ].c_str() );
}
1218 
1219 void StardictDictionary::makeFTSIndex( QAtomicInt & isCancelled, bool firstIteration )
1220 {
1221   if( !( Dictionary::needToRebuildIndex( getDictionaryFilenames(), ftsIdxName )
1222          || FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this ) ) )
1223     FTS_index_completed.ref();
1224 
1225   if( haveFTSIndex() )
1226     return;
1227 
1228   if( ensureInitDone().size() )
1229     return;
1230 
1231   if( firstIteration && getArticleCount() > FTS::MaxDictionarySizeForFastSearch )
1232     return;
1233 
1234   gdDebug( "Stardict: Building the full-text index for dictionary: %s\n",
1235            getName().c_str() );
1236 
1237   try
1238   {
1239     FtsHelpers::makeFTSIndex( this, isCancelled );
1240     FTS_index_completed.ref();
1241   }
1242   catch( std::exception &ex )
1243   {
1244     gdWarning( "Stardict: Failed building full-text search index for \"%s\", reason: %s\n", getName().c_str(), ex.what() );
1245     QFile::remove( FsEncoding::decode( ftsIdxName.c_str() ) );
1246   }
1247 }
1248 
1249 void StardictDictionary::getArticleText( uint32_t articleAddress, QString & headword, QString & text )
1250 {
1251   try
1252   {
1253     string headwordStr, articleStr;
1254     loadArticle( articleAddress, headwordStr, articleStr );
1255 
1256     headword = QString::fromUtf8( headwordStr.data(), headwordStr.size() );
1257 
1258     wstring wstr = Utf8::decode( articleStr );
1259 
1260     text = Html::unescape( gd::toQString( wstr ) );
1261   }
1262   catch( std::exception &ex )
1263   {
1264     gdWarning( "Stardict: Failed retrieving article from \"%s\", reason: %s\n", getName().c_str(), ex.what() );
1265   }
1266 }
1267 
/// Creates an FtsHelpers::FTSResultsRequest for this dictionary, passing all
/// search parameters through unchanged.
sptr< Dictionary::DataRequest > StardictDictionary::getSearchResults( QString const & searchString,
                                                                      int searchMode, bool matchCase,
                                                                      int distanceBetweenWords,
                                                                      int maxResults,
                                                                      bool ignoreWordsOrder,
                                                                      bool ignoreDiacritics )
{
  return new FtsHelpers::FTSResultsRequest( *this, searchString,searchMode, matchCase, distanceBetweenWords, maxResults, ignoreWordsOrder, ignoreDiacritics );
}
1277 
1278 /// StardictDictionary::findHeadwordsForSynonym()
1279 
1280 class StardictHeadwordsRequest;
1281 
1282 class StardictHeadwordsRequestRunnable: public QRunnable
1283 {
1284   StardictHeadwordsRequest & r;
1285   QSemaphore & hasExited;
1286 
1287 public:
1288 
1289   StardictHeadwordsRequestRunnable( StardictHeadwordsRequest & r_,
1290                                     QSemaphore & hasExited_ ): r( r_ ),
1291                                                                hasExited( hasExited_ )
1292   {}
1293 
1294   ~StardictHeadwordsRequestRunnable()
1295   {
1296     hasExited.release();
1297   }
1298 
1299   virtual void run();
1300 };
1301 
1302 class StardictHeadwordsRequest: public Dictionary::WordSearchRequest
1303 {
1304   friend class StardictHeadwordsRequestRunnable;
1305 
1306   wstring word;
1307   StardictDictionary & dict;
1308 
1309   QAtomicInt isCancelled;
1310   QSemaphore hasExited;
1311 
1312 public:
1313 
1314   StardictHeadwordsRequest( wstring const & word_,
1315                             StardictDictionary & dict_ ):
1316     word( word_ ), dict( dict_ )
1317   {
1318     QThreadPool::globalInstance()->start(
1319       new StardictHeadwordsRequestRunnable( *this, hasExited ) );
1320   }
1321 
1322   void run(); // Run from another thread by StardictHeadwordsRequestRunnable
1323 
1324   virtual void cancel()
1325   {
1326     isCancelled.ref();
1327   }
1328 
1329   ~StardictHeadwordsRequest()
1330   {
1331     isCancelled.ref();
1332     hasExited.acquire();
1333   }
1334 };
1335 
/// Forwards to the run() method of the bound StardictHeadwordsRequest.
void StardictHeadwordsRequestRunnable::run()
{
  r.run();
}
1340 
1341 void StardictHeadwordsRequest::run()
1342 {
1343   if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
1344   {
1345     finish();
1346     return;
1347   }
1348 
1349   try
1350   {
1351     vector< WordArticleLink > chain = dict.findArticles( word );
1352 
1353     wstring caseFolded = Folding::applySimpleCaseOnly( word );
1354 
1355     for( unsigned x = 0; x < chain.size(); ++x )
1356     {
1357       if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
1358       {
1359         finish();
1360         return;
1361       }
1362 
1363       string headword, articleText;
1364 
1365       dict.loadArticle( chain[ x ].articleOffset,
1366                         headword, articleText );
1367 
1368       wstring headwordDecoded = Utf8::decode( headword );
1369 
1370       if ( caseFolded != Folding::applySimpleCaseOnly( headwordDecoded ) )
1371       {
1372         // The headword seems to differ from the input word, which makes the
1373         // input word its synonym.
1374         Mutex::Lock _( dataMutex );
1375 
1376         matches.push_back( headwordDecoded );
1377       }
1378     }
1379   }
1380   catch( std::exception & e )
1381   {
1382     setErrorString( QString::fromUtf8( e.what() ) );
1383   }
1384 
1385   finish();
1386 }
1387 
1388 sptr< Dictionary::WordSearchRequest >
1389   StardictDictionary::findHeadwordsForSynonym( wstring const & word )
1390   THROW_SPEC( std::exception )
1391 {
1392   return synonymSearchEnabled ? new StardictHeadwordsRequest( word, *this ) :
1393                                 Class::findHeadwordsForSynonym( word );
1394 }
1395 
1396 
1397 /// StardictDictionary::getArticle()
1398 
1399 class StardictArticleRequest;
1400 
1401 class StardictArticleRequestRunnable: public QRunnable
1402 {
1403   StardictArticleRequest & r;
1404   QSemaphore & hasExited;
1405 
1406 public:
1407 
1408   StardictArticleRequestRunnable( StardictArticleRequest & r_,
1409                                   QSemaphore & hasExited_ ): r( r_ ),
1410                                                              hasExited( hasExited_ )
1411   {}
1412 
1413   ~StardictArticleRequestRunnable()
1414   {
1415     hasExited.release();
1416   }
1417 
1418   virtual void run();
1419 };
1420 
1421 class StardictArticleRequest: public Dictionary::DataRequest
1422 {
1423   friend class StardictArticleRequestRunnable;
1424 
1425   wstring word;
1426   vector< wstring > alts;
1427   StardictDictionary & dict;
1428   bool ignoreDiacritics;
1429 
1430   QAtomicInt isCancelled;
1431   QSemaphore hasExited;
1432 
1433 public:
1434 
1435   StardictArticleRequest( wstring const & word_,
1436                      vector< wstring > const & alts_,
1437                      StardictDictionary & dict_,
1438                      bool ignoreDiacritics_ ):
1439     word( word_ ), alts( alts_ ), dict( dict_ ), ignoreDiacritics( ignoreDiacritics_ )
1440   {
1441     QThreadPool::globalInstance()->start(
1442       new StardictArticleRequestRunnable( *this, hasExited ) );
1443   }
1444 
1445   void run(); // Run from another thread by StardictArticleRequestRunnable
1446 
1447   virtual void cancel()
1448   {
1449     isCancelled.ref();
1450   }
1451 
1452   ~StardictArticleRequest()
1453   {
1454     isCancelled.ref();
1455     hasExited.acquire();
1456   }
1457 };
1458 
/// Forwards to the run() method of the bound StardictArticleRequest.
void StardictArticleRequestRunnable::run()
{
  r.run();
}
1463 
1464 void StardictArticleRequest::run()
1465 {
1466   if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
1467   {
1468     finish();
1469     return;
1470   }
1471 
1472   try
1473   {
1474     vector< WordArticleLink > chain = dict.findArticles( word, ignoreDiacritics );
1475 
1476     for( unsigned x = 0; x < alts.size(); ++x )
1477     {
1478       /// Make an additional query for each alt
1479 
1480       vector< WordArticleLink > altChain = dict.findArticles( alts[ x ], ignoreDiacritics );
1481 
1482       chain.insert( chain.end(), altChain.begin(), altChain.end() );
1483     }
1484 
1485     multimap< wstring, pair< string, string > > mainArticles, alternateArticles;
1486 
1487     set< uint32_t > articlesIncluded; // Some synonims make it that the articles
1488                                       // appear several times. We combat this
1489                                       // by only allowing them to appear once.
1490 
1491     wstring wordCaseFolded = Folding::applySimpleCaseOnly( word );
1492     if( ignoreDiacritics )
1493       wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
1494 
1495     for( unsigned x = 0; x < chain.size(); ++x )
1496     {
1497       if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
1498       {
1499         finish();
1500         return;
1501       }
1502 
1503       if ( articlesIncluded.find( chain[ x ].articleOffset ) != articlesIncluded.end() )
1504         continue; // We already have this article in the body.
1505 
1506       // Now grab that article
1507 
1508       string headword, articleText;
1509 
1510       dict.loadArticle( chain[ x ].articleOffset, headword, articleText );
1511 
1512       // Ok. Now, does it go to main articles, or to alternate ones? We list
1513       // main ones first, and alternates after.
1514 
1515       // We do the case-folded comparison here.
1516 
1517       wstring headwordStripped =
1518         Folding::applySimpleCaseOnly( Utf8::decode( headword ) );
1519       if( ignoreDiacritics )
1520         headwordStripped = Folding::applyDiacriticsOnly( headwordStripped );
1521 
1522       multimap< wstring, pair< string, string > > & mapToUse =
1523         ( wordCaseFolded == headwordStripped ) ?
1524           mainArticles : alternateArticles;
1525 
1526       mapToUse.insert( pair< wstring, pair< string, string > >(
1527         Folding::applySimpleCaseOnly( Utf8::decode( headword ) ),
1528         pair< string, string >( headword, articleText ) ) );
1529 
1530       articlesIncluded.insert( chain[ x ].articleOffset );
1531     }
1532 
1533     if ( mainArticles.empty() && alternateArticles.empty() )
1534     {
1535       // No such word
1536       finish();
1537       return;
1538     }
1539 
1540     string result;
1541 
1542     multimap< wstring, pair< string, string > >::const_iterator i;
1543 
1544     string cleaner = "</font>""</font>""</font>""</font>""</font>""</font>"
1545                      "</font>""</font>""</font>""</font>""</font>""</font>"
1546                      "</b></b></b></b></b></b></b></b>"
1547                      "</i></i></i></i></i></i></i></i>";
1548 
1549     for( i = mainArticles.begin(); i != mainArticles.end(); ++i )
1550     {
1551         result += dict.isFromLanguageRTL() ? "<h3 class=\"sdct_headwords\" dir=\"rtl\">" : "<h3 class=\"sdct_headwords\">";
1552         result += i->second.first;
1553         result += "</h3>";
1554         if( dict.isToLanguageRTL() )
1555           result += "<div style=\"display:inline;\" dir=\"rtl\">";
1556         result += i->second.second;
1557         result += cleaner;
1558         if( dict.isToLanguageRTL() )
1559           result += "</div>";
1560     }
1561 
1562     for( i = alternateArticles.begin(); i != alternateArticles.end(); ++i )
1563     {
1564         result += dict.isFromLanguageRTL() ? "<h3 class=\"sdct_headwords\" dir=\"rtl\">" : "<h3 class=\"sdct_headwords\">";
1565         result += i->second.first;
1566         result += "</h3>";
1567         if( dict.isToLanguageRTL() )
1568           result += "<div style=\"display:inline;\" dir=\"rtl\">";
1569         result += i->second.second;
1570         result += cleaner;
1571         if( dict.isToLanguageRTL() )
1572           result += "</div>";
1573     }
1574 
1575     Mutex::Lock _( dataMutex );
1576 
1577     data.resize( result.size() );
1578 
1579     memcpy( &data.front(), result.data(), result.size() );
1580 
1581     hasAnyData = true;
1582   }
1583   catch( std::exception & e )
1584   {
1585     setErrorString( QString::fromUtf8( e.what() ) );
1586   }
1587 
1588   finish();
1589 }
1590 
/// Creates a StardictArticleRequest for the word and its alternate forms.
/// The third (unnamed) parameter is not used.
sptr< Dictionary::DataRequest > StardictDictionary::getArticle( wstring const & word,
                                                                vector< wstring > const & alts,
                                                                wstring const &,
                                                                bool ignoreDiacritics )
  THROW_SPEC( std::exception )
{
  return new StardictArticleRequest( word, alts, *this, ignoreDiacritics );
}
1599 
1600 
/// Checks whether str starts with substr.
/// Returns a pointer to the first character of str after the prefix, or 0
/// when str does not start with substr.
static char const * beginsWith( char const * substr, char const * str )
{
  size_t const prefixLength = strlen( substr );

  if ( strncmp( str, substr, prefixLength ) != 0 )
    return 0;

  return str + prefixLength;
}
1607 
1608 Ifo::Ifo( File::Class & f ):
1609   wordcount( 0 ), synwordcount( 0 ), idxfilesize( 0 ), idxoffsetbits( 32 )
1610 {
1611   static string const versionEq( "version=" );
1612 
1613   static string const booknameEq( "bookname=" );
1614 
1615   //DPRINTF( "%s<\n", f.gets().c_str() );
1616   //DPRINTF( "%s<\n", f.gets().c_str() );
1617 
1618   if ( QString::fromUtf8(f.gets().c_str()) != "StarDict's dict ifo file" ||
1619        f.gets().compare( 0, versionEq.size(), versionEq ) )
1620     throw exNotAnIfoFile();
1621 
1622   /// Now go through the file and parse options
1623 
1624   try
1625   {
1626     char option[ 16384 ];
1627 
1628     for( ; ; )
1629     {
1630       if ( !f.gets( option, sizeof( option ), true ) )
1631         break;
1632 
1633       if ( char const * val = beginsWith( "bookname=", option ) )
1634         bookname = val;
1635       else
1636       if ( char const * val = beginsWith( "wordcount=", option ) )
1637       {
1638         if ( sscanf( val, "%u", & wordcount ) != 1 )
1639           throw exBadFieldInIfo( option );
1640       }
1641       else
1642       if ( char const * val = beginsWith( "synwordcount=", option ) )
1643       {
1644         if ( sscanf( val, "%u", & synwordcount ) != 1 )
1645           throw exBadFieldInIfo( option );
1646       }
1647       else
1648       if ( char const * val = beginsWith( "idxfilesize=", option ) )
1649       {
1650         if ( sscanf( val, "%u", & idxfilesize ) != 1 )
1651           throw exBadFieldInIfo( option );
1652       }
1653       else
1654       if ( char const * val = beginsWith( "idxoffsetbits=", option ) )
1655       {
1656         if ( sscanf( val, "%u", & idxoffsetbits ) != 1 || ( idxoffsetbits != 32
1657              && idxoffsetbits != 64 ) )
1658           throw exBadFieldInIfo( option );
1659       }
1660       else
1661       if ( char const * val = beginsWith( "sametypesequence=", option ) )
1662         sametypesequence = val;
1663       else
1664       if ( char const * val = beginsWith( "dicttype=", option ) )
1665         dicttype = val;
1666       else
1667       if ( char const * val = beginsWith( "description=", option ) )
1668         description = val;
1669       else
1670       if ( char const * val = beginsWith( "copyright=", option ) )
1671         copyright = val;
1672       else
1673       if ( char const * val = beginsWith( "author=", option ) )
1674         author = val;
1675       else
1676       if ( char const * val = beginsWith( "email=", option ) )
1677         email = val;
1678       else
1679       if ( char const * val = beginsWith( "website=", option ) )
1680         website = val;
1681       else
1682       if ( char const * val = beginsWith( "date=", option ) )
1683         date = val;
1684     }
1685   }
1686   catch( File::exReadError & )
1687   {
1688   }
1689 }
1690 
1691 //// StardictDictionary::getResource()
1692 
1693 class StardictResourceRequest;
1694 
1695 class StardictResourceRequestRunnable: public QRunnable
1696 {
1697   StardictResourceRequest & r;
1698   QSemaphore & hasExited;
1699 
1700 public:
1701 
1702   StardictResourceRequestRunnable( StardictResourceRequest & r_,
1703                                QSemaphore & hasExited_ ): r( r_ ),
1704                                                           hasExited( hasExited_ )
1705   {}
1706 
1707   ~StardictResourceRequestRunnable()
1708   {
1709     hasExited.release();
1710   }
1711 
1712   virtual void run();
1713 };
1714 
1715 class StardictResourceRequest: public Dictionary::DataRequest
1716 {
1717   friend class StardictResourceRequestRunnable;
1718 
1719   StardictDictionary & dict;
1720 
1721   string resourceName;
1722 
1723   QAtomicInt isCancelled;
1724   QSemaphore hasExited;
1725 
1726 public:
1727 
1728   StardictResourceRequest( StardictDictionary & dict_,
1729                       string const & resourceName_ ):
1730     dict( dict_ ),
1731     resourceName( resourceName_ )
1732   {
1733     QThreadPool::globalInstance()->start(
1734       new StardictResourceRequestRunnable( *this, hasExited ) );
1735   }
1736 
1737   void run(); // Run from another thread by StardictResourceRequestRunnable
1738 
1739   virtual void cancel()
1740   {
1741     isCancelled.ref();
1742   }
1743 
1744   ~StardictResourceRequest()
1745   {
1746     isCancelled.ref();
1747     hasExited.acquire();
1748   }
1749 };
1750 
/// Thread-pool entry point: simply forwards to the run() method of the
/// request this task was created for.
void StardictResourceRequestRunnable::run()
{
  r.run();
}
1755 
1756 void StardictResourceRequest::run()
1757 {
1758   // Some runnables linger enough that they are cancelled before they start
1759   if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
1760   {
1761     finish();
1762     return;
1763   }
1764 
1765   try
1766   {
1767     if( resourceName.at( 0 ) == '\x1E' )
1768       resourceName = resourceName.erase( 0, 1 );
1769     if( resourceName.at( resourceName.length() - 1 ) == '\x1F' )
1770       resourceName.erase( resourceName.length() - 1, 1 );
1771 
1772     string n =
1773       FsEncoding::dirname( dict.getDictionaryFilenames()[ 0 ] ) +
1774       FsEncoding::separator() +
1775       "res" +
1776       FsEncoding::separator() +
1777       FsEncoding::encode( resourceName );
1778 
1779     GD_DPRINTF( "n is %s\n", n.c_str() );
1780 
1781     try
1782     {
1783       Mutex::Lock _( dataMutex );
1784 
1785       File::loadFromFile( n, data );
1786     }
1787     catch( File::exCantOpen & )
1788     {
1789       // Try reading from zip file
1790 
1791       if ( dict.resourceZip.isOpen() )
1792       {
1793         Mutex::Lock _( dict.resourceZipMutex );
1794 
1795         Mutex::Lock __( dataMutex );
1796 
1797         if ( !dict.resourceZip.loadFile( Utf8::decode( resourceName ), data ) )
1798           throw; // Make it fail since we couldn't read the archive
1799       }
1800       else
1801         throw;
1802     }
1803 
1804     if ( Filetype::isNameOfTiff( resourceName ) )
1805     {
1806       // Convert it
1807 
1808       dataMutex.lock();
1809 
1810       QImage img = QImage::fromData( (unsigned char *) &data.front(),
1811                                      data.size() );
1812 
1813 #ifdef MAKE_EXTRA_TIFF_HANDLER
1814       if( img.isNull() )
1815         GdTiff::tiffToQImage( &data.front(), data.size(), img );
1816 #endif
1817 
1818       dataMutex.unlock();
1819 
1820       if ( !img.isNull() )
1821       {
1822         // Managed to load -- now store it back as BMP
1823 
1824         QByteArray ba;
1825         QBuffer buffer( &ba );
1826         buffer.open( QIODevice::WriteOnly );
1827         img.save( &buffer, "BMP" );
1828 
1829         Mutex::Lock _( dataMutex );
1830 
1831         data.resize( buffer.size() );
1832 
1833         memcpy( &data.front(), buffer.data(), data.size() );
1834       }
1835     }
1836 
1837     if( Filetype::isNameOfCSS( resourceName ) )
1838     {
1839       Mutex::Lock _( dataMutex );
1840 
1841       QString css = QString::fromUtf8( data.data(), data.size() );
1842 
1843       // Correct some url's
1844 
1845       QString id = QString::fromUtf8( dict.getId().c_str() );
1846       int pos = 0;
1847 
1848 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
1849       QRegularExpression links( "url\\(\\s*(['\"]?)([^'\"]*)(['\"]?)\\s*\\)",
1850                                 QRegularExpression::CaseInsensitiveOption );
1851 
1852       QString newCSS;
1853       QRegularExpressionMatchIterator it = links.globalMatch( css );
1854       while( it.hasNext() )
1855       {
1856         QRegularExpressionMatch match = it.next();
1857         newCSS += css.midRef( pos, match.capturedStart() - pos );
1858         pos = match.capturedEnd();
1859 
1860         QString url = match.captured( 2 );
1861 
1862         if( url.indexOf( ":/" ) >= 0 || url.indexOf( "data:" ) >= 0)
1863         {
1864           // External link
1865           newCSS += match.captured();
1866           continue;
1867         }
1868 
1869         QString newUrl = QString( "url(" ) + match.captured( 1 ) + "bres://"
1870                                            + id + "/" + url + match.captured( 3 ) + ")";
1871         newCSS += newUrl;
1872       }
1873       if( pos )
1874       {
1875         newCSS += css.midRef( pos );
1876         css = newCSS;
1877         newCSS.clear();
1878       }
1879 #else
1880       QRegExp links( "url\\(\\s*(['\"]?)([^'\"]*)(['\"]?)\\s*\\)", Qt::CaseInsensitive, QRegExp::RegExp );
1881       for( ; ; )
1882       {
1883         pos = links.indexIn( css, pos );
1884         if( pos < 0 )
1885           break;
1886         QString url = links.cap( 2 );
1887 
1888         if( url.indexOf( ":/" ) >= 0 || url.indexOf( "data:" ) >= 0)
1889         {
1890           // External link
1891           pos += links.cap().size();
1892           continue;
1893         }
1894 
1895         QString newUrl = QString( "url(" ) + links.cap( 1 ) + "bres://"
1896                                            + id + "/" + url + links.cap( 3 ) + ")";
1897         css.replace( pos, links.cap().size(), newUrl );
1898         pos += newUrl.size();
1899       }
1900 #endif
1901 
1902       dict.isolateCSS( css );
1903       QByteArray bytes = css.toUtf8();
1904       data.resize( bytes.size() );
1905       memcpy( &data.front(), bytes.constData(), bytes.size() );
1906     }
1907 
1908     Mutex::Lock _( dataMutex );
1909     hasAnyData = true;
1910   }
1911   catch( std::exception &ex )
1912   {
1913     gdWarning( "Stardict: Failed loading resource \"%s\" for \"%s\", reason: %s\n",
1914               resourceName.c_str(), dict.getName().c_str(), ex.what() );
1915     // Resource not loaded -- we don't set the hasAnyData flag then
1916   }
1917   catch( ... )
1918   {
1919   }
1920 
1921   finish();
1922 }
1923 
/// Creates a request for the resource with the given name. The request is
/// a StardictResourceRequest bound to this dictionary; its constructor
/// schedules the loading on the global thread pool, so the returned object
/// is filled in asynchronously.
sptr< Dictionary::DataRequest > StardictDictionary::getResource( string const & name )
  THROW_SPEC( std::exception )
{
  return new StardictResourceRequest( *this, name );
}
1929 
1930 } // anonymous namespace
1931 
findCorrespondingFiles(string const & ifo,string & idx,string & dict,string & syn)1932 static void findCorrespondingFiles( string const & ifo,
1933                                     string & idx, string & dict, string & syn )
1934 {
1935   string base( ifo, 0, ifo.size() - 3 );
1936 
1937   if ( !(
1938           File::tryPossibleName( base + "idx", idx ) ||
1939           File::tryPossibleName( base + "idx.gz", idx ) ||
1940           File::tryPossibleName( base + "idx.dz", idx ) ||
1941           File::tryPossibleName( base + "IDX", idx ) ||
1942           File::tryPossibleName( base + "IDX.GZ", idx ) ||
1943           File::tryPossibleName( base + "IDX.DZ", idx )
1944       ) )
1945     throw exNoIdxFile( ifo );
1946 
1947   if ( !(
1948           File::tryPossibleName( base + "dict", dict ) ||
1949           File::tryPossibleName( base + "dict.dz", dict ) ||
1950           File::tryPossibleName( base + "DICT", dict ) ||
1951           File::tryPossibleName( base + "dict.DZ", dict )
1952       ) )
1953     throw exNoDictFile( ifo );
1954 
1955   if ( !(
1956          File::tryPossibleName( base + "syn", syn ) ||
1957          File::tryPossibleName( base + "syn.gz", syn ) ||
1958          File::tryPossibleName( base + "syn.dz", syn ) ||
1959          File::tryPossibleName( base + "SYN", syn ) ||
1960          File::tryPossibleName( base + "SYN.GZ", syn ) ||
1961          File::tryPossibleName( base + "SYN.DZ", syn )
1962      ) )
1963     syn.clear();
1964 }
1965 
handleIdxSynFile(string const & fileName,IndexedWords & indexedWords,ChunkedStorage::Writer & chunks,vector<uint32_t> * articleOffsets,bool isSynFile,bool parseHeadwords)1966 static void handleIdxSynFile( string const & fileName,
1967                               IndexedWords & indexedWords,
1968                               ChunkedStorage::Writer & chunks,
1969                               vector< uint32_t > * articleOffsets,
1970                               bool isSynFile, bool parseHeadwords )
1971 {
1972   gzFile stardictIdx = gd_gzopen( fileName.c_str() );
1973   if ( !stardictIdx )
1974     throw exCantReadFile( fileName );
1975 
1976   vector< char > image;
1977 
1978   for( ; ; )
1979   {
1980     size_t oldSize = image.size();
1981 
1982     image.resize( oldSize + 65536 );
1983 
1984     int rd = gzread( stardictIdx, &image.front() + oldSize, 65536 );
1985 
1986     if ( rd < 0 )
1987     {
1988       gzclose( stardictIdx );
1989       throw exCantReadFile( fileName );
1990     }
1991 
1992     if ( rd != 65536 )
1993     {
1994       image.resize( oldSize + rd + 1 );
1995       break;
1996     }
1997   }
1998   gzclose( stardictIdx );
1999 
2000   // We append one zero byte to catch runaway string at the end, if any
2001 
2002   image.back() = 0;
2003 
2004   // Now parse it
2005 
2006   for( char const * ptr = &image.front(); ptr != &image.back(); )
2007   {
2008     size_t wordLen = strlen( ptr );
2009 
2010     if ( ptr + wordLen + 1 + ( isSynFile ? sizeof( uint32_t ) :
2011                                            sizeof( uint32_t ) * 2 ) >
2012          &image.back() )
2013     {
2014       GD_FDPRINTF( stderr, "Warning: sudden end of file %s\n", fileName.c_str() );
2015       break;
2016     }
2017 
2018     char const * word = ptr;
2019 
2020     ptr += wordLen + 1;
2021 
2022     uint32_t offset;
2023 
2024     if( strstr( word, "&#" ) )
2025     {
2026       // Decode some html-coded symbols in headword
2027       string unescapedWord = Html::unescapeUtf8( word );
2028       strncpy( (char *)word, unescapedWord.c_str(), wordLen );
2029       wordLen = strlen( word );
2030     }
2031 
2032     if ( !isSynFile )
2033     {
2034       // We're processing the .idx file
2035       uint32_t articleOffset, articleSize;
2036 
2037       memcpy( &articleOffset, ptr, sizeof( uint32_t ) );
2038       ptr += sizeof( uint32_t );
2039       memcpy( &articleSize, ptr, sizeof( uint32_t ) );
2040       ptr += sizeof( uint32_t );
2041 
2042       articleOffset = ntohl( articleOffset );
2043       articleSize = ntohl( articleSize );
2044 
2045       // Create an entry for the article in the chunked storage
2046 
2047       offset = chunks.startNewBlock();
2048 
2049       if ( articleOffsets )
2050         articleOffsets->push_back( offset );
2051 
2052       chunks.addToBlock( &articleOffset, sizeof( uint32_t ) );
2053       chunks.addToBlock( &articleSize, sizeof( uint32_t ) );
2054       chunks.addToBlock( word, wordLen + 1 );
2055     }
2056     else
2057     {
2058       // We're processing the .syn file
2059       uint32_t offsetInIndex;
2060 
2061       memcpy( &offsetInIndex, ptr, sizeof( uint32_t ) );
2062       ptr += sizeof( uint32_t );
2063 
2064       offsetInIndex = ntohl( offsetInIndex );
2065 
2066       if ( offsetInIndex >= articleOffsets->size() )
2067         throw exIncorrectOffset( fileName );
2068 
2069       offset = (*articleOffsets)[ offsetInIndex ];
2070 
2071       // Some StarDict dictionaries are in fact badly converted Babylon ones.
2072       // They contain a lot of superfluous slashed entries with dollar signs.
2073       // We try to filter them out here, since those entries become much more
2074       // apparent in GoldenDict than they were in StarDict because of
2075       // punctuation folding. Hopefully there are not a whole lot of valid
2076       // synonyms which really start from slash and contain dollar signs, or
2077       // end with dollar and contain slashes.
2078       if ( *word == '/' )
2079       {
2080         if ( strchr( word, '$' ) )
2081           continue; // Skip this entry
2082       }
2083       else
2084       if ( wordLen && word[ wordLen - 1 ] == '$' )
2085       {
2086         if ( strchr( word, '/' ) )
2087           continue; // Skip this entry
2088       }
2089     }
2090 
2091     // Insert new entry into an index
2092 
2093     if( parseHeadwords )
2094       indexedWords.addWord( Utf8::decode( word ), offset );
2095     else
2096       indexedWords.addSingleWord( Utf8::decode( word ), offset );
2097   }
2098 
2099   GD_DPRINTF( "%u entires made\n", (unsigned) indexedWords.size() );
2100 }
2101 
2102 
makeDictionaries(vector<string> const & fileNames,string const & indicesDir,Dictionary::Initializing & initializing,unsigned maxHeadwordsToExpand)2103 vector< sptr< Dictionary::Class > > makeDictionaries(
2104                                       vector< string > const & fileNames,
2105                                       string const & indicesDir,
2106                                       Dictionary::Initializing & initializing,
2107                                       unsigned maxHeadwordsToExpand )
2108   THROW_SPEC( std::exception )
2109 {
2110   vector< sptr< Dictionary::Class > > dictionaries;
2111 
2112   for( vector< string >::const_iterator i = fileNames.begin(); i != fileNames.end();
2113        ++i )
2114   {
2115     if ( i->size() < 4 ||
2116         strcasecmp( i->c_str() + ( i->size() - 4 ), ".ifo" ) != 0 )
2117       continue;
2118 
2119     try
2120     {
2121       vector< string > dictFiles( 1, *i );
2122 
2123       string idxFileName, dictFileName, synFileName;
2124 
2125       findCorrespondingFiles( *i, idxFileName, dictFileName, synFileName );
2126 
2127       dictFiles.push_back( idxFileName );
2128       dictFiles.push_back( dictFileName );
2129 
2130       if ( synFileName.size() )
2131         dictFiles.push_back( synFileName );
2132 
2133       // See if there's a zip file with resources present. If so, include it.
2134 
2135       string zipFileName;
2136       string baseName = FsEncoding::dirname( idxFileName ) + FsEncoding::separator();
2137 
2138       if ( File::tryPossibleZipName( baseName + "res.zip", zipFileName ) ||
2139            File::tryPossibleZipName( baseName + "RES.ZIP", zipFileName ) ||
2140            File::tryPossibleZipName( baseName + "res" + FsEncoding::separator() + "res.zip", zipFileName ) )
2141         dictFiles.push_back( zipFileName );
2142 
2143       string dictId = Dictionary::makeDictionaryId( dictFiles );
2144 
2145       string indexFile = indicesDir + dictId;
2146 
2147       if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) ||
2148            indexIsOldOrBad( indexFile ) )
2149       {
2150         // Building the index
2151 
2152         File::Class ifoFile( *i, "r" );
2153 
2154         Ifo ifo( ifoFile );
2155 
2156         gdDebug( "Stardict: Building the index for dictionary: %s\n", ifo.bookname.c_str() );
2157 
2158         if ( ifo.idxoffsetbits == 64 )
2159           throw ex64BitsNotSupported();
2160 
2161         if ( ifo.dicttype.size() )
2162           throw exDicttypeNotSupported();
2163 
2164         if( synFileName.empty() )
2165         {
2166           if ( ifo.synwordcount )
2167           {
2168             GD_DPRINTF( "Warning: dictionary has synwordcount specified, but no "
2169                         "corresponding .syn file was found\n" );
2170             ifo.synwordcount = 0;  // Pretend it wasn't there
2171           }
2172         }
2173         else
2174         if ( !ifo.synwordcount )
2175         {
2176           GD_DPRINTF( "Warning: ignoring .syn file %s, since there's no synwordcount in .ifo specified\n",
2177                       synFileName.c_str() );
2178         }
2179 
2180 
2181         GD_DPRINTF( "bookname = %s\n", ifo.bookname.c_str() );
2182         GD_DPRINTF( "wordcount = %u\n", ifo.wordcount );
2183 
2184         initializing.indexingDictionary( ifo.bookname );
2185 
2186         File::Class idx( indexFile, "wb" );
2187 
2188         IdxHeader idxHeader;
2189 
2190         memset( &idxHeader, 0, sizeof( idxHeader ) );
2191 
2192         // We write a dummy header first. At the end of the process the header
2193         // will be rewritten with the right values.
2194 
2195         idx.write( idxHeader );
2196 
2197         idx.write( ifo.bookname.data(), ifo.bookname.size() );
2198         idx.write( ifo.sametypesequence.data(), ifo.sametypesequence.size() );
2199 
2200         IndexedWords indexedWords;
2201 
2202         ChunkedStorage::Writer chunks( idx );
2203 
2204         // Load indices
2205         if ( !ifo.synwordcount )
2206           handleIdxSynFile( idxFileName, indexedWords, chunks, 0, false,
2207                             !maxHeadwordsToExpand || ifo.wordcount < maxHeadwordsToExpand );
2208         else
2209         {
2210           vector< uint32_t > articleOffsets;
2211 
2212           articleOffsets.reserve( ifo.wordcount );
2213 
2214           handleIdxSynFile( idxFileName, indexedWords, chunks, &articleOffsets,
2215                             false,
2216                             !maxHeadwordsToExpand || ( ifo.wordcount + ifo.synwordcount ) < maxHeadwordsToExpand );
2217 
2218           handleIdxSynFile( synFileName, indexedWords, chunks, &articleOffsets,
2219                             true,
2220                             !maxHeadwordsToExpand || ( ifo.wordcount + ifo.synwordcount ) < maxHeadwordsToExpand );
2221         }
2222 
2223         // Finish with the chunks
2224 
2225         idxHeader.chunksOffset = chunks.finish();
2226 
2227         // Build index
2228 
2229         IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );
2230 
2231         idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
2232         idxHeader.indexRootOffset = idxInfo.rootOffset;
2233 
2234         // That concludes it. Update the header.
2235 
2236         idxHeader.signature = Signature;
2237         idxHeader.formatVersion = CurrentFormatVersion;
2238 
2239         idxHeader.wordCount = ifo.wordcount;
2240         idxHeader.synWordCount = ifo.synwordcount;
2241         idxHeader.bookNameSize = ifo.bookname.size();
2242         idxHeader.sameTypeSequenceSize = ifo.sametypesequence.size();
2243 
2244         // read languages
2245         QPair<quint32,quint32> langs =
2246             LangCoder::findIdsForFilename( QString::fromStdString( dictFileName ) );
2247 
2248         // if no languages found, try dictionary's name
2249         if ( langs.first == 0 || langs.second == 0 )
2250         {
2251           langs =
2252             LangCoder::findIdsForFilename( QString::fromStdString( ifo.bookname ) );
2253         }
2254 
2255         idxHeader.langFrom = langs.first;
2256         idxHeader.langTo = langs.second;
2257 
2258         // If there was a zip file, index it too
2259 
2260         if ( zipFileName.size() )
2261         {
2262           GD_DPRINTF( "Indexing zip file\n" );
2263 
2264           idxHeader.hasZipFile = 1;
2265 
2266           IndexedWords zipFileNames;
2267           IndexedZip zipFile;
2268           if( zipFile.openZipFile( QDir::fromNativeSeparators(
2269                                    FsEncoding::decode( zipFileName.c_str() ) ) ) )
2270               zipFile.indexFile( zipFileNames );
2271 
2272           if( !zipFileNames.empty() )
2273           {
2274             // Build the resulting zip file index
2275 
2276             IndexInfo idxInfo = BtreeIndexing::buildIndex( zipFileNames, idx );
2277 
2278             idxHeader.zipIndexBtreeMaxElements = idxInfo.btreeMaxElements;
2279             idxHeader.zipIndexRootOffset = idxInfo.rootOffset;
2280           }
2281           else
2282           {
2283             // Bad zip file -- no index (though the mark that we have one
2284             // remains)
2285             idxHeader.zipIndexBtreeMaxElements = 0;
2286             idxHeader.zipIndexRootOffset = 0;
2287           }
2288         }
2289         else
2290           idxHeader.hasZipFile = 0;
2291 
2292         // That concludes it. Update the header.
2293 
2294         idx.rewind();
2295 
2296         idx.write( &idxHeader, sizeof( idxHeader ) );
2297       }
2298 
2299       dictionaries.push_back( new StardictDictionary( dictId,
2300                                                       indexFile,
2301                                                       dictFiles ) );
2302     }
2303     catch( std::exception & e )
2304     {
2305       gdWarning( "Stardict dictionary initializing failed: %s, error: %s\n",
2306                  i->c_str(), e.what() );
2307     }
2308   }
2309 
2310   return dictionaries;
2311 }
2312 
2313 
2314 }
2315