1 /* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
2 * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
3
4 #include "stardict.hh"
5 #include "btreeidx.hh"
6 #include "folding.hh"
7 #include "utf8.hh"
8 #include "chunkedstorage.hh"
9 #include "dictzip.h"
10 #include "xdxf2html.hh"
11 #include "htmlescape.hh"
12 #include "langcoder.hh"
13 #include "gddebug.hh"
14 #include "fsencoding.hh"
15 #include "filetype.hh"
16 #include "indexedzip.hh"
17 #include "tiff.hh"
18 #include "ftshelpers.hh"
19 #include "wstring_qt.hh"
20 #include "audiolink.hh"
21
22 #include <zlib.h>
23 #include <map>
24 #include <set>
25 #include <string>
26 #ifndef __WIN32
27 #include <arpa/inet.h>
28 #else
29 #include <winsock.h>
30 #endif
31 #include <stdlib.h>
32
33 #ifdef _MSC_VER
34 #include <stub_msvc.h>
35 #endif
36
37 #include <QString>
38 #include <QSemaphore>
39 #include <QThreadPool>
40 #include <QAtomicInt>
41 #include <QDebug>
42 #include <QRegExp>
43 #include <QStringList>
44 #include <QDomDocument>
45 #include <QDomNode>
46 #include "ufile.hh"
47 #include "qt4x5.hh"
48
49 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
50 #include <QRegularExpression>
51 #endif
52
53 namespace Stardict {
54
55 using std::map;
56 using std::multimap;
57 using std::pair;
58 using std::set;
59 using std::string;
60 using gd::wstring;
61
62 using BtreeIndexing::WordArticleLink;
63 using BtreeIndexing::IndexedWords;
64 using BtreeIndexing::IndexInfo;
65
66 namespace {
67
68 DEF_EX( exNotAnIfoFile, "Not an .ifo file", Dictionary::Ex )
69 DEF_EX_STR( exBadFieldInIfo, "Bad field in .ifo file encountered:", Dictionary::Ex )
70 DEF_EX_STR( exNoIdxFile, "No corresponding .idx file was found for", Dictionary::Ex )
71 DEF_EX_STR( exNoDictFile, "No corresponding .dict file was found for", Dictionary::Ex )
72 DEF_EX_STR( exNoSynFile, "No corresponding .syn file was found for", Dictionary::Ex )
73
74 DEF_EX( ex64BitsNotSupported, "64-bit indices are not presently supported, sorry", Dictionary::Ex )
75 DEF_EX( exDicttypeNotSupported, "Dictionaries with dicttypes are not supported, sorry", Dictionary::Ex )
76
77 DEF_EX_STR( exCantReadFile, "Can't read file", Dictionary::Ex )
78 DEF_EX_STR( exWordIsTooLarge, "Enountered a word that is too large:", Dictionary::Ex )
79 DEF_EX_STR( exSuddenEndOfFile, "Sudden end of file", Dictionary::Ex )
80 DEF_EX_STR( exDictzipError, "DICTZIP error", Dictionary::Ex )
81
82 DEF_EX_STR( exIncorrectOffset, "Incorrect offset encountered in file", Dictionary::Ex )
83
84 /// Contents of an ifo file
85 struct Ifo
86 {
87 string version;
88 string bookname;
89 uint32_t wordcount, synwordcount, idxfilesize, idxoffsetbits;
90 string sametypesequence, dicttype, description;
91 string copyright, author, email, website, date;
92
93 Ifo( File::Class & );
94 };
95
96 enum
97 {
98 Signature = 0x58444953, // SIDX on little-endian, XDIS on big-endian
99 CurrentFormatVersion = 9 + BtreeIndexing::FormatVersion + Folding::Version
100 };
101
/// Header of the index file. It is read back as one raw record (see
/// indexIsOldOrBad() and the StardictDictionary constructor), so the order
/// and size of the fields define the file layout and must not be changed
/// without changing CurrentFormatVersion.
struct IdxHeader
{
  uint32_t signature; // First comes the signature, SIDX
  uint32_t formatVersion; // File format version (CurrentFormatVersion)
  uint32_t chunksOffset; // The offset to chunks' storage
  uint32_t indexBtreeMaxElements; // Two fields from IndexInfo
  uint32_t indexRootOffset;
  uint32_t wordCount; // Saved from Ifo::wordcount
  uint32_t synWordCount; // Saved from Ifo::synwordcount
  uint32_t bookNameSize; // Book name's length. Used to read it then.
  uint32_t sameTypeSequenceSize; // That string's size. Used to read it then.
  uint32_t langFrom;  // Source language
  uint32_t langTo;    // Target language
  uint32_t hasZipFile; // Non-zero means there's a zip file with resources present
  uint32_t zipIndexBtreeMaxElements; // Two fields from IndexInfo of the zip
                                     // resource index.
  uint32_t zipIndexRootOffset;
}
// The packed attribute is only applied when not building with MSVC.
#ifndef _MSC_VER
__attribute__((packed))
#endif
;
124
indexIsOldOrBad(string const & indexFile)125 bool indexIsOldOrBad( string const & indexFile )
126 {
127 File::Class idx( indexFile, "rb" );
128
129 IdxHeader header;
130
131 return idx.readRecords( &header, sizeof( header ), 1 ) != 1 ||
132 header.signature != Signature ||
133 header.formatVersion != CurrentFormatVersion;
134 }
135
136 class StardictDictionary: public BtreeIndexing::BtreeDictionary
137 {
138 Mutex idxMutex;
139 File::Class idx;
140 IdxHeader idxHeader;
141 string bookName;
142 string sameTypeSequence;
143 ChunkedStorage::Reader chunks;
144 Mutex dzMutex;
145 dictData * dz;
146 Mutex resourceZipMutex;
147 IndexedZip resourceZip;
148
149 public:
150
151 StardictDictionary( string const & id, string const & indexFile,
152 vector< string > const & dictionaryFiles );
153
154 ~StardictDictionary();
155
getName()156 virtual string getName() throw()
157 { return bookName; }
158
getProperties()159 virtual map< Dictionary::Property, string > getProperties() throw()
160 { return map< Dictionary::Property, string >(); }
161
getArticleCount()162 virtual unsigned long getArticleCount() throw()
163 { return idxHeader.wordCount; }
164
getWordCount()165 virtual unsigned long getWordCount() throw()
166 { return idxHeader.wordCount + idxHeader.synWordCount; }
167
getLangFrom() const168 inline virtual quint32 getLangFrom() const
169 { return idxHeader.langFrom; }
170
getLangTo() const171 inline virtual quint32 getLangTo() const
172 { return idxHeader.langTo; }
173
174 virtual sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( wstring const & )
175 THROW_SPEC( std::exception );
176
177 virtual sptr< Dictionary::DataRequest > getArticle( wstring const &,
178 vector< wstring > const & alts,
179 wstring const &,
180 bool ignoreDiacritics )
181 THROW_SPEC( std::exception );
182
183 virtual sptr< Dictionary::DataRequest > getResource( string const & name )
184 THROW_SPEC( std::exception );
185
186 virtual QString const& getDescription();
187
188 virtual QString getMainFilename();
189
190 virtual sptr< Dictionary::DataRequest > getSearchResults( QString const & searchString,
191 int searchMode, bool matchCase,
192 int distanceBetweenWords,
193 int maxResults,
194 bool ignoreWordsOrder,
195 bool ignoreDiacritics );
196 virtual void getArticleText( uint32_t articleAddress, QString & headword, QString & text );
197
198 virtual void makeFTSIndex(QAtomicInt & isCancelled, bool firstIteration );
199
setFTSParameters(Config::FullTextSearch const & fts)200 virtual void setFTSParameters( Config::FullTextSearch const & fts )
201 {
202 can_FTS = fts.enabled
203 && !fts.disabledTypes.contains( "STARDICT", Qt::CaseInsensitive )
204 && ( fts.maxDictionarySize == 0 || getArticleCount() <= fts.maxDictionarySize );
205 }
206 protected:
207
208 void loadIcon() throw();
209
210 private:
211
212 /// Retrieves the article's offset/size in .dict file, and its headword.
213 void getArticleProps( uint32_t articleAddress,
214 string & headword,
215 uint32_t & offset, uint32_t & size );
216
217 /// Loads the article, storing its headword and formatting the data it has
218 /// into an html.
219 void loadArticle( uint32_t address,
220 string & headword,
221 string & articleText );
222
223 string loadString( size_t size );
224
225 string handleResource( char type, char const * resource, size_t size );
226
227 void pangoToHtml( QString & text );
228
229 friend class StardictResourceRequest;
230 friend class StardictArticleRequest;
231 friend class StardictHeadwordsRequest;
232 };
233
StardictDictionary(string const & id,string const & indexFile,vector<string> const & dictionaryFiles)234 StardictDictionary::StardictDictionary( string const & id,
235 string const & indexFile,
236 vector< string > const & dictionaryFiles ):
237 BtreeDictionary( id, dictionaryFiles ),
238 idx( indexFile, "rb" ),
239 idxHeader( idx.read< IdxHeader >() ),
240 bookName( loadString( idxHeader.bookNameSize ) ),
241 sameTypeSequence( loadString( idxHeader.sameTypeSequenceSize ) ),
242 chunks( idx, idxHeader.chunksOffset )
243 {
244 // Open the .dict file
245
246 DZ_ERRORS error;
247 dz = dict_data_open( dictionaryFiles[ 2 ].c_str(), &error, 0 );
248
249 if ( !dz )
250 throw exDictzipError( string( dz_error_str( error ) )
251 + "(" + dictionaryFiles[ 2 ] + ")" );
252
253 // Initialize the index
254
255 openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
256 idxHeader.indexRootOffset ),
257 idx, idxMutex );
258
259 // Open a resource zip file, if there's one
260
261 if ( idxHeader.hasZipFile &&
262 ( idxHeader.zipIndexBtreeMaxElements ||
263 idxHeader.zipIndexRootOffset ) )
264 {
265 resourceZip.openIndex( IndexInfo( idxHeader.zipIndexBtreeMaxElements,
266 idxHeader.zipIndexRootOffset ),
267 idx, idxMutex );
268
269 QString zipName = QDir::fromNativeSeparators(
270 FsEncoding::decode( getDictionaryFilenames().back().c_str() ) );
271
272 if ( zipName.endsWith( ".zip", Qt::CaseInsensitive ) ) // Sanity check
273 resourceZip.openZipFile( zipName );
274 }
275
276 // Full-text search parameters
277
278 can_FTS = true;
279
280 ftsIdxName = indexFile + "_FTS";
281
282 if( !Dictionary::needToRebuildIndex( dictionaryFiles, ftsIdxName )
283 && !FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this ) )
284 FTS_index_completed.ref();
285 }
286
~StardictDictionary()287 StardictDictionary::~StardictDictionary()
288 {
289 if ( dz )
290 dict_data_close( dz );
291 }
292
loadIcon()293 void StardictDictionary::loadIcon() throw()
294 {
295 if ( dictionaryIconLoaded )
296 return;
297
298 QString fileName =
299 QDir::fromNativeSeparators( FsEncoding::decode( getDictionaryFilenames()[ 0 ].c_str() ) );
300
301 // Remove the extension
302 fileName.chop( 3 );
303
304 if( !loadIconFromFile( fileName ) )
305 {
306 // Load failed -- use default icons
307 dictionaryNativeIcon = dictionaryIcon = QIcon(":/icons/icon32_stardict.png");
308 }
309
310 dictionaryIconLoaded = true;
311 }
312
loadString(size_t size)313 string StardictDictionary::loadString( size_t size )
314 {
315 if( size == 0 )
316 return string();
317
318 vector< char > data( size );
319
320 idx.read( &data.front(), data.size() );
321
322 return string( &data.front(), data.size() );
323 }
324
getArticleProps(uint32_t articleAddress,string & headword,uint32_t & offset,uint32_t & size)325 void StardictDictionary::getArticleProps( uint32_t articleAddress,
326 string & headword,
327 uint32_t & offset, uint32_t & size )
328 {
329 vector< char > chunk;
330
331 Mutex::Lock _( idxMutex );
332
333 char * articleData = chunks.getBlock( articleAddress, chunk );
334
335 memcpy( &offset, articleData, sizeof( uint32_t ) );
336 articleData += sizeof( uint32_t );
337 memcpy( &size, articleData, sizeof( uint32_t ) );
338 articleData += sizeof( uint32_t );
339
340 headword = articleData;
341 }
342
343 class PowerWordDataProcessor{
344 class PWSyntaxTranslate{
345 public:
PWSyntaxTranslate(const char * re,const char * replacement)346 PWSyntaxTranslate(const char* re, const char* replacement)
347 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
348 : _re(re, QRegularExpression::UseUnicodePropertiesOption )
349 #else
350 : _re(re)
351 #endif
352 , _replacement(replacement)
353 {
354 }
355 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
re() const356 const QRegularExpression & re() const {
357 #else
358 const QRegExp& re() const {
359 #endif
360 return _re;
361 }
362 const QString & replacement() const {
363 return _replacement;
364 }
365 private:
366 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
367 QRegularExpression _re;
368 #else
369 QRegExp _re;
370 #endif
371 QString _replacement;
372 };
373 public:
PowerWordDataProcessor(const char * resource,size_t size)374 PowerWordDataProcessor(const char* resource, size_t size)
375 : _data(QString::fromUtf8(resource, size))
376 {
377 }
378
process()379 string process() {
380 QDomDocument doc;
381 QString ss;
382 ss = "<div class=\"sdct_k\">";
383 if (!doc.setContent(_data)) {
384 ss += _data ;
385 } else {
386 QStringList sl;
387 walkNode(doc.firstChild(), sl);
388
389 QStringListIterator itr(sl);
390 while (itr.hasNext()) {
391 QString s = itr.next();
392 translatePW(s);
393 ss += s;
394 ss += "<br>";
395 }
396 }
397 ss += "</div>";
398 QByteArray ba = ss.toUtf8();
399 return string(ba.data(), ba.size());
400 }
401 private:
walkNode(const QDomNode & e,QStringList & sl)402 void walkNode(const QDomNode& e, QStringList& sl) {
403 if (e.isNull()) {
404 return;
405 }
406 if (e.isText()) {
407 sl.append(e.toText().data());
408 } else {
409 QDomNodeList l = e.childNodes();
410 for (int i = 0; i < l.size(); ++i) {
411 QDomNode n = l.at(i);
412 if (n.isText()) {
413 sl.append(n.toText().data());
414 } else {
415 walkNode(n, sl);
416 }
417 }
418 }
419 }
420
translatePW(QString & s)421 void translatePW(QString& s){
422 const int TRANSLATE_TBL_SIZE=5;
423 static PWSyntaxTranslate t[TRANSLATE_TBL_SIZE]={
424 PWSyntaxTranslate("&[bB]\\s*\\{([^\\{}&]+)\\}", "<B>\\1</B>"),
425 PWSyntaxTranslate("&[iI]\\s*\\{([^\\{}&]+)\\}", "<I>\\1</I>"),
426 PWSyntaxTranslate("&[uU]\\s*\\{([^\\{}&]+)\\}", "<U>\\1</U>"),
427 PWSyntaxTranslate("&[lL]\\s*\\{([^\\{}&]+)\\}", "<SPAN style=\"color:#0000ff\">\\1</SPAN>"),
428 PWSyntaxTranslate("&[2]\\s*\\{([^\\{}&]+)\\}", "<SPAN style=\"color:#0000ff\">\\1</SPAN>")
429 };
430
431 QString old;
432 while (s.compare(old) != 0) {
433 for (int i = 0; i < TRANSLATE_TBL_SIZE; ++i) {
434 PWSyntaxTranslate& a = t[i];
435 s.replace(a.re(), a.replacement());
436 }
437 old = s;
438 }
439 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
440 s.replace(QRegularExpression( "&.\\s*\\{",
441 QRegularExpression::UseUnicodePropertiesOption
442 | QRegularExpression::DotMatchesEverythingOption),
443 "");
444 #else
445 s.replace(QRegExp("&.\\s*\\{"), "");
446 #endif
447 s.replace("}", "");
448 }
449 private:
450 QString _data;
451 };
452
453
454 /// This function tries to make an html of the Stardict's resource typed
455 /// 'type', contained in a block pointed to by 'resource', 'size' bytes long.
handleResource(char type,char const * resource,size_t size)456 string StardictDictionary::handleResource( char type, char const * resource, size_t size )
457 {
458 QString text;
459 switch( type )
460 {
461 case 'x': // Xdxf content
462 return Xdxf2Html::convert( string( resource, size ), Xdxf2Html::STARDICT, NULL, this, &resourceZip );
463 case 'h': // Html content
464 {
465 QString articleText = QString( "<div class=\"sdct_h\">" ) + QString::fromUtf8( resource, size ) + "</div>";
466
467 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
468 QRegularExpression imgRe( "(<\\s*img\\s+[^>]*src\\s*=\\s*[\"']+)(?!(?:data|https?|ftp):)",
469 QRegularExpression::CaseInsensitiveOption
470 | QRegularExpression::InvertedGreedinessOption );
471 QRegularExpression linkRe( "(<\\s*link\\s+[^>]*href\\s*=\\s*[\"']+)(?!(?:data|https?|ftp):)",
472 QRegularExpression::CaseInsensitiveOption
473 | QRegularExpression::InvertedGreedinessOption );
474 #else
475 QRegExp imgRe( "(<\\s*img\\s+[^>]*src\\s*=\\s*[\"']+)(?!(?:data|https?|ftp):)", Qt::CaseInsensitive );
476 imgRe.setMinimal( true );
477 QRegExp linkRe( "(<\\s*link\\s+[^>]*href\\s*=\\s*[\"']+)(?!(?:data|https?|ftp):)", Qt::CaseInsensitive );
478 linkRe.setMinimal( true );
479 #endif
480
481 articleText.replace( imgRe , "\\1bres://" + QString::fromStdString( getId() ) + "/" )
482 .replace( linkRe, "\\1bres://" + QString::fromStdString( getId() ) + "/" );
483
484 // Handle links to articles
485
486 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
487 QRegularExpression linksReg( "<a(\\s*[^>]*)href\\s*=\\s*['\"](bword://)?([^'\"]+)['\"]",
488 QRegularExpression::CaseInsensitiveOption );
489 #else
490 QRegExp linksReg( "<a(\\s*[^>]*)href\\s*=\\s*['\"](bword://)?([^'\"]+)['\"]", Qt::CaseInsensitive );
491 linksReg.setMinimal( true );
492 #endif
493
494 int pos = 0;
495 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
496 QString articleNewText;
497 QRegularExpressionMatchIterator it = linksReg.globalMatch( articleText );
498 while( it.hasNext() )
499 {
500 QRegularExpressionMatch match = it.next();
501 articleNewText += articleText.midRef( pos, match.capturedStart() - pos );
502 pos = match.capturedEnd();
503
504 QString link = match.captured( 3 );
505 #else
506 while( pos >= 0 )
507 {
508 pos = linksReg.indexIn( articleText, pos );
509 if( pos < 0 )
510 break;
511
512 QString link = linksReg.cap( 3 );
513 #endif
514 if( link.indexOf( ':' ) < 0 )
515 {
516 QString newLink;
517 if( link.indexOf( '#' ) < 0 )
518 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
519 newLink = QString( "<a" ) + match.captured( 1 ) + "href=\"bword:" + link + "\"";
520 #else
521 newLink = QString( "<a" ) + linksReg.cap( 1 ) + "href=\"bword:" + link + "\"";
522 #endif
523
524 // Anchors
525
526 if( link.indexOf( '#' ) > 0 )
527 {
528 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
529 newLink = QString( "<a" ) + match.captured( 1 ) + "href=\"gdlookup://localhost/" + link + "\"";
530 #else
531 newLink = QString( "<a" ) + linksReg.cap( 1 ) + "href=\"gdlookup://localhost/" + link + "\"";
532 #endif
533 newLink.replace( "#", "?gdanchor=" );
534 }
535
536 if( !newLink.isEmpty() )
537 {
538 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
539 articleNewText += newLink;
540 #else
541 articleText.replace( pos, linksReg.cap( 0 ).size(), newLink );
542 pos += newLink.size();
543 #endif
544 }
545 else
546 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
547 articleNewText += match.captured();
548 #else
549 pos += linksReg.cap( 0 ).size();
550 #endif
551 }
552 else
553 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
554 articleNewText += match.captured();
555 }
556 if( pos )
557 {
558 articleNewText += articleText.midRef( pos );
559 articleText = articleNewText;
560 articleNewText.clear();
561 }
562 #else
563 pos += linksReg.cap( 0 ).size();
564 }
565 #endif
566
567 // Handle "audio" tags
568
569 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
570 QRegularExpression audioRe( "<\\s*audio\\s*src\\s*=\\s*([\"']+)([^\"']+)([\"'])\\s*>(.*)</audio>",
571 QRegularExpression::CaseInsensitiveOption
572 | QRegularExpression::DotMatchesEverythingOption
573 | QRegularExpression::InvertedGreedinessOption );
574 #else
575 QRegExp audioRe( "<\\s*audio\\s*src\\s*=\\s*([\"']+)([^\"']+)([\"'])\\s*>(.*)</audio>", Qt::CaseInsensitive );
576 audioRe.setMinimal( true );
577 #endif
578
579 pos = 0;
580
581 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
582 it = audioRe.globalMatch( articleText );
583 while( it.hasNext() )
584 {
585 QRegularExpressionMatch match = it.next();
586 articleNewText += articleText.midRef( pos, match.capturedStart() - pos );
587 pos = match.capturedEnd();
588
589 QString src = match.captured( 2 );
590 #else
591 while( pos >= 0 )
592 {
593 pos = audioRe.indexIn( articleText, pos );
594 if( pos < 0 )
595 break;
596
597 QString src = audioRe.cap( 2 );
598 #endif
599 if( src.indexOf( "://" ) >= 0 )
600 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
601 articleNewText += match.captured();
602 #else
603 pos += audioRe.cap( 0 ).length();
604 #endif
605 else
606 {
607 std::string href = "\"gdau://" + getId() + "/" + src.toUtf8().data() + "\"";
608 QString newTag = QString::fromUtf8( ( addAudioLink( href, getId() ) + "<span class=\"sdict_h_wav\"><a href=" + href + ">" ).c_str() );
609 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
610 newTag += match.captured( 4 );
611 if( match.captured( 4 ).indexOf( "<img " ) < 0 )
612 #else
613 newTag += audioRe.cap( 4 );
614 if( audioRe.cap( 4 ).indexOf( "<img " ) < 0 )
615 #endif
616 newTag += " <img src=\"qrcx://localhost/icons/playsound.png\" border=\"0\" alt=\"Play\">";
617 newTag += "</a></span>";
618
619 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
620 articleNewText += newTag;
621 #else
622 articleText.replace( pos, audioRe.cap( 0 ).length(), newTag );
623 pos += newTag.length();
624 #endif
625 }
626 }
627 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
628 if( pos )
629 {
630 articleNewText += articleText.midRef( pos );
631 articleText = articleNewText;
632 articleNewText.clear();
633 }
634 #endif
635
636 return ( articleText.toUtf8().data() );
637 }
638 case 'm': // Pure meaning, usually means preformatted text
639 return "<div class=\"sdct_m\">" + Html::preformat( string( resource, size ), isToLanguageRTL() ) + "</div>";
640 case 'l': // Same as 'm', but not in utf8, instead in current locale's
641 // encoding.
642 // We just use Qt here, it should know better about system's
643 // locale.
644 return "<div class=\"sdct_l\">" + Html::preformat( QString::fromLocal8Bit( resource, size ).toUtf8().data(),
645 isToLanguageRTL() )
646 + "</div>";
647 case 'g': // Pango markup.
648 text = QString::fromUtf8( resource, size );
649 pangoToHtml( text );
650 return "<div class=\"sdct_g\">" + string( text.toUtf8().data() ) + "</div>";
651 case 't': // Transcription
652 return "<div class=\"sdct_t\">" + Html::escape( string( resource, size ) ) + "</div>";
653 case 'y': // Chinese YinBiao or Japanese KANA. Examples are needed. For now,
654 // just output as pure escaped utf8.
655 return "<div class=\"sdct_y\">" + Html::escape( string( resource, size ) ) + "</div>";
656 case 'k': // KingSoft PowerWord data.
657 {
658 PowerWordDataProcessor pwdp(resource, size);
659 return pwdp.process();
660 }
661 case 'w': // MediaWiki markup. We don't handle this right now.
662 return "<div class=\"sdct_w\">" + Html::escape( string( resource, size ) ) + "</div>";
663 case 'n': // WordNet data. We don't know anything about it.
664 return "<div class=\"sdct_n\">" + Html::escape( string( resource, size ) ) + "</div>";
665
666 case 'r': // Resource file list. For now, resources aren't handled.
667 return "<div class=\"sdct_r\">" + Html::escape( string( resource, size ) ) + "</div>";
668
669 case 'W': // An embedded Wav file. Unhandled yet.
670 return "<div class=\"sdct_W\">(an embedded .wav file)</div>";
671 case 'P': // An embedded picture file. Unhandled yet.
672 return "<div class=\"sdct_P\">(an embedded picture file)</div>";
673 }
674
675 if ( islower( type ) )
676 {
677 return string( "<b>Unknown textual entry type " ) + string( 1, type ) + ":</b> " + Html::escape( string( resource, size ) ) + "<br>";
678 }
679 else
680 return string( "<b>Unknown blob entry type " ) + string( 1, type ) + "</b><br>";
681 }
682
683 void StardictDictionary::pangoToHtml( QString & text )
684 {
685 /*
686 * Partially support for Pango Markup Language
687 * Attributes "fallback", "lang", "gravity", "gravity_hint" just ignored
688 */
689
690 QRegExp spanRegex( "<span\\s*([^>]*)>", Qt::CaseInsensitive );
691 QRegExp styleRegex( "(\\w+)=\"([^\"]*)\"" );
692
693 text.replace( "\n", "<br>" );
694
695 int pos = 0;
696 do
697 {
698 pos = spanRegex.indexIn( text, pos );
699 if( pos >= 0 )
700 {
701 QString styles = spanRegex.cap( 1 );
702 QString newSpan( "<span style=\"" );
703 int stylePos = 0;
704 do
705 {
706 stylePos = styleRegex.indexIn( styles, stylePos );
707 QString style = styleRegex.cap( 1 );
708 if( stylePos >= 0 )
709 {
710 if( style.compare( "font_desc", Qt::CaseInsensitive ) == 0
711 || style.compare( "font", Qt::CaseInsensitive ) == 0 )
712 {
713 // Parse font description
714
715 QStringList list = styleRegex.cap( 2 ).split( " ", QString::SkipEmptyParts );
716 int n;
717 QString sizeStr, stylesStr, familiesStr;
718 for( n = list.size() - 1; n >= 0; n-- )
719 {
720 QString str = list.at( n );
721
722 // font size
723 if( str[ 0 ].isNumber() )
724 {
725 sizeStr = QString( "font-size:" ) + str + ";";
726 continue;
727 }
728
729 // font style
730 if( str.compare( "normal", Qt::CaseInsensitive ) == 0
731 || str.compare( "oblique", Qt::CaseInsensitive ) == 0
732 || str.compare( "italic", Qt::CaseInsensitive ) == 0 )
733 {
734 if( !stylesStr.contains( "font-style:" ) )
735 stylesStr += QString( "font-style:" ) + str + ";";
736 continue;
737 }
738
739 // font variant
740 if( str.compare( "smallcaps", Qt::CaseInsensitive ) == 0 )
741 {
742 stylesStr += QString( "font-variant:small-caps" ) ;
743 continue;
744 }
745
746 // font weight
747 if( str.compare( "ultralight", Qt::CaseInsensitive ) == 0 )
748 {
749 stylesStr += QString( "font-weight:100;" );
750 continue;
751 }
752 if( str.compare( "light", Qt::CaseInsensitive ) == 0 )
753 {
754 stylesStr += QString( "font-weight:200;" );
755 continue;
756 }
757 if( str.compare( "bold", Qt::CaseInsensitive ) == 0 )
758 {
759 stylesStr += QString( "font-weight:bold;" );
760 continue;
761 }
762 if( str.compare( "ultrabold", Qt::CaseInsensitive ) == 0 )
763 {
764 stylesStr += QString( "font-weight:800;" );
765 continue;
766 }
767 if( str.compare( "heavy", Qt::CaseInsensitive ) == 0 )
768 {
769 stylesStr += QString( "font-weight:900" );
770 continue;
771 }
772
773 // font stretch
774 if( str.compare( "ultracondensed", Qt::CaseInsensitive ) == 0 )
775 {
776 stylesStr += QString( "font-stretch:ultra-condensed;" );
777 continue;
778 }
779 if( str.compare( "extracondensed", Qt::CaseInsensitive ) == 0 )
780 {
781 stylesStr += QString( "font-stretch:extra-condensed;" );
782 continue;
783 }
784 if( str.compare( "semicondensed", Qt::CaseInsensitive ) == 0 )
785 {
786 stylesStr += QString( "font-stretch:semi-condensed;" );
787 continue;
788 }
789 if( str.compare( "semiexpanded", Qt::CaseInsensitive ) == 0 )
790 {
791 stylesStr += QString( "font-stretch:semi-expanded;" );
792 continue;
793 }
794 if( str.compare( "extraexpanded", Qt::CaseInsensitive ) == 0 )
795 {
796 stylesStr += QString( "font-stretch:extra-expanded;" );
797 continue;
798 }
799 if( str.compare( "ultraexpanded", Qt::CaseInsensitive ) == 0 )
800 {
801 stylesStr += QString( "font-stretch:ultra-expanded;" );
802 continue;
803 }
804 if( str.compare( "condensed", Qt::CaseInsensitive ) == 0
805 || str.compare( "expanded", Qt::CaseInsensitive ) == 0 )
806 {
807 stylesStr += QString( "font-stretch:" ) + str + ";";
808 continue;
809 }
810
811 // gravity
812 if( str.compare( "south", Qt::CaseInsensitive ) == 0
813 || str.compare( "east", Qt::CaseInsensitive ) == 0
814 || str.compare( "north", Qt::CaseInsensitive ) == 0
815 || str.compare( "west", Qt::CaseInsensitive ) == 0
816 || str.compare( "auto", Qt::CaseInsensitive ) == 0 )
817 {
818 continue;
819 }
820 break;
821 }
822
823 // last words is families list
824 if( n >= 0 )
825 {
826 familiesStr = QString( "font-family:" );
827 for( int i = 0; i <= n; i++ )
828 {
829 if( i > 0 && !familiesStr.endsWith( ',' ) )
830 familiesStr += ",";
831 familiesStr += list.at( i );
832 }
833 familiesStr += ";";
834 }
835
836 newSpan += familiesStr + stylesStr + sizeStr;
837 }
838 else if( style.compare( "font_family", Qt::CaseInsensitive ) == 0
839 || style.compare( "face", Qt::CaseInsensitive ) == 0 )
840 newSpan += QString( "font-family:" ) + styleRegex.cap( 2 ) + ";";
841 else if( style.compare( "font_size", Qt::CaseInsensitive ) == 0
842 || style.compare( "size", Qt::CaseInsensitive ) == 0 )
843 {
844 if( styleRegex.cap( 2 )[ 0 ].isLetter()
845 || styleRegex.cap( 2 ).endsWith( "px", Qt::CaseInsensitive )
846 || styleRegex.cap( 2 ).endsWith( "pt", Qt::CaseInsensitive )
847 || styleRegex.cap( 2 ).endsWith( "em", Qt::CaseInsensitive )
848 || styleRegex.cap( 2 ).endsWith( "%" ) )
849 newSpan += QString( "font-size:" ) + styleRegex.cap( 2 ) +";";
850 else
851 {
852 int size = styleRegex.cap( 2 ).toInt();
853 if( size )
854 newSpan += QString( "font-size:%1pt;" ).arg( size / 1024.0, 0, 'f', 3 );
855 }
856 }
857 else if( style.compare( "font_style", Qt::CaseInsensitive ) == 0
858 || style.compare( "style", Qt::CaseInsensitive ) == 0)
859 newSpan += QString( "font-style:" ) + styleRegex.cap( 2 ) + ";";
860 else if( style.compare( "weight", Qt::CaseInsensitive ) == 0
861 || style.compare( "weight", Qt::CaseInsensitive ) == 0)
862 {
863 QString str = styleRegex.cap( 2 );
864 if( str.compare( "ultralight", Qt::CaseInsensitive ) == 0 )
865 newSpan += QString( "font-weight:100;" );
866 else if( str.compare( "light", Qt::CaseInsensitive ) == 0 )
867 newSpan += QString( "font-weight:200;" );
868 else if( str.compare( "ultrabold", Qt::CaseInsensitive ) == 0 )
869 newSpan += QString( "font-weight:800;" );
870 else if( str.compare( "heavy", Qt::CaseInsensitive ) == 0 )
871 newSpan += QString( "font-weight:900" );
872 else
873 newSpan += QString( "font-weight:" ) + str + ";";
874 }
875 else if( style.compare( "font_variant", Qt::CaseInsensitive ) == 0
876 || style.compare( "variant", Qt::CaseInsensitive ) == 0 )
877 {
878 if( styleRegex.cap( 2 ).compare( "smallcaps", Qt::CaseInsensitive ) == 0 )
879 newSpan += QString( "font-variant:small-caps" );
880 else
881 newSpan += QString( "font-variant:" ) + styleRegex.cap( 2 ) + ";";
882 }
883 else if( style.compare( "font_stretch", Qt::CaseInsensitive ) == 0
884 || style.compare( "stretch", Qt::CaseInsensitive ) == 0 )
885 {
886 QString str = styleRegex.cap( 2 );
887 if( str.compare( "ultracondensed", Qt::CaseInsensitive ) == 0 )
888 newSpan += QString( "font-stretch:ultra-condensed;" );
889 else if( str.compare( "extracondensed", Qt::CaseInsensitive ) == 0 )
890 newSpan += QString( "font-stretch:extra-condensed;" );
891 else if( str.compare( "semicondensed", Qt::CaseInsensitive ) == 0 )
892 newSpan += QString( "font-stretch:semi-condensed;" );
893 else if( str.compare( "semiexpanded", Qt::CaseInsensitive ) == 0 )
894 newSpan += QString( "font-stretch:semi-expanded;" );
895 else if( str.compare( "extraexpanded", Qt::CaseInsensitive ) == 0 )
896 newSpan += QString( "font-stretch:extra-expanded;" );
897 else if( str.compare( "ultraexpanded", Qt::CaseInsensitive ) == 0 )
898 newSpan += QString( "font-stretch:ultra-expanded;" );
899 else
900 newSpan += QString( "font-stretch:" ) + str + ";";
901 }
902 else if( style.compare( "foreground", Qt::CaseInsensitive ) == 0
903 || style.compare( "fgcolor", Qt::CaseInsensitive ) == 0
904 || style.compare( "color", Qt::CaseInsensitive ) == 0 )
905 newSpan += QString( "color:" ) + styleRegex.cap( 2 ) + ";";
906 else if( style.compare( "background", Qt::CaseInsensitive ) == 0
907 || style.compare( "bgcolor", Qt::CaseInsensitive ) == 0 )
908 newSpan += QString( "background-color:" ) + styleRegex.cap( 2 ) + ";";
909 else if( style.compare( "underline_color", Qt::CaseInsensitive ) == 0
910 || style.compare( "strikethrough_color", Qt::CaseInsensitive ) == 0 )
911 newSpan += QString( "text-decoration-color:" ) + styleRegex.cap( 2 ) + ";";
912 else if( style.compare( "underline", Qt::CaseInsensitive ) == 0 )
913 {
914 if( styleRegex.cap( 2 ).compare( "none", Qt::CaseInsensitive ) )
915 newSpan += QString( "text-decoration-line:none;" );
916 else
917 {
918 newSpan += QString( "text-decoration-line:underline; " );
919 if( styleRegex.cap( 2 ).compare( "low", Qt::CaseInsensitive ) )
920 newSpan += QString( "text-decoration-style:dotted;" );
921 else if( styleRegex.cap( 2 ).compare( "single", Qt::CaseInsensitive ) )
922 newSpan += QString( "text-decoration-style:solid;" );
923 else if( styleRegex.cap( 2 ).compare( "error", Qt::CaseInsensitive ) )
924 newSpan += QString( "text-decoration-style:wavy;" );
925 else
926 newSpan += QString( "text-decoration-style:" ) + styleRegex.cap( 2 ) + ";";
927 }
928 }
929 else if( style.compare( "strikethrough", Qt::CaseInsensitive ) == 0 )
930 {
931 if( styleRegex.cap( 2 ).compare( "true", Qt::CaseInsensitive ) )
932 newSpan += QString( "text-decoration-line:line-through;" );
933 else
934 newSpan += QString( "text-decoration-line:none;" );
935 }
936 else if( style.compare( "rise", Qt::CaseInsensitive ) == 0 )
937 {
938 if( styleRegex.cap( 2 ).endsWith( "px", Qt::CaseInsensitive )
939 || styleRegex.cap( 2 ).endsWith( "pt", Qt::CaseInsensitive )
940 || styleRegex.cap( 2 ).endsWith( "em", Qt::CaseInsensitive )
941 || styleRegex.cap( 2 ).endsWith( "%" ) )
942 newSpan += QString( "vertical-align:" ) + styleRegex.cap( 2 ) +";";
943 else
944 {
945 int riseValue = styleRegex.cap( 2 ).toInt();
946 if( riseValue )
947 newSpan += QString( "vertical-align:%1pt;" ).arg( riseValue / 1024.0, 0, 'f', 3 );
948 }
949 }
950 else if( style.compare( "letter_spacing", Qt::CaseInsensitive ) == 0 )
951 {
952 if( styleRegex.cap( 2 ).endsWith( "px", Qt::CaseInsensitive )
953 || styleRegex.cap( 2 ).endsWith( "pt", Qt::CaseInsensitive )
954 || styleRegex.cap( 2 ).endsWith( "em", Qt::CaseInsensitive )
955 || styleRegex.cap( 2 ).endsWith( "%" ) )
956 newSpan += QString( "letter-spacing:" ) + styleRegex.cap( 2 ) +";";
957 else
958 {
959 int spacing = styleRegex.cap( 2 ).toInt();
960 if( spacing )
961 newSpan += QString( "letter-spacing:%1pt;" ).arg( spacing / 1024.0, 0, 'f', 3 );
962 }
963 }
964
965 stylePos += styleRegex.matchedLength();
966 }
967 }
968 while( stylePos >= 0 );
969
970 newSpan += "\">";
971 text.replace( pos, spanRegex.matchedLength(), newSpan );
972 pos += newSpan.size();
973 }
974 }
975 while( pos >= 0 );
976
977 text.replace( " ", " " );
978 }
979
980 void StardictDictionary::loadArticle( uint32_t address,
981 string & headword,
982 string & articleText )
983 {
984 uint32_t offset, size;
985
986 getArticleProps( address, headword, offset, size );
987
988 char * articleBody;
989
990 {
991 Mutex::Lock _( dzMutex );
992
993 // Note that the function always zero-pads the result.
994 articleBody = dict_data_read_( dz, offset, size, 0, 0 );
995 }
996
997 if ( !articleBody )
998 {
999 // throw exCantReadFile( getDictionaryFilenames()[ 2 ] );
1000 articleText = string( "<div class=\"sdict_m\">DICTZIP error: " ) + dict_error_str( dz ) + "</div>";
1001 return;
1002 }
1003
1004 articleText.clear();
1005
1006 char * ptr = articleBody;
1007
1008 if ( sameTypeSequence.size() )
1009 {
1010 /// The sequence is known, it's not stored in the article itself
1011 for( unsigned seq = 0; seq < sameTypeSequence.size(); ++seq )
1012 {
1013 // Last entry doesn't have size info -- it is inferred from
1014 // the bytes left
1015 bool entrySizeKnown = ( seq == sameTypeSequence.size() - 1 );
1016
1017 uint32_t entrySize = 0;
1018
1019 if ( entrySizeKnown )
1020 entrySize = size;
1021 else
1022 if ( !size )
1023 {
1024 gdWarning( "Stardict: short entry for the word %s encountered in \"%s\".\n", headword.c_str(), getName().c_str() );
1025 break;
1026 }
1027
1028 char type = sameTypeSequence[ seq ];
1029
1030 if ( islower( type ) )
1031 {
1032 // Zero-terminated entry, unless it's the last one
1033 if ( !entrySizeKnown )
1034 entrySize = strlen( ptr );
1035
1036 if ( size < entrySize )
1037 {
1038 gdWarning( "Stardict: malformed entry for the word %s encountered in \"%s\".\n", headword.c_str(), getName().c_str() );
1039 break;
1040 }
1041
1042 articleText += handleResource( type, ptr, entrySize );
1043
1044 if ( !entrySizeKnown )
1045 ++entrySize; // Need to skip the zero byte
1046
1047 ptr += entrySize;
1048 size -= entrySize;
1049 }
1050 else
1051 if ( isupper( *ptr ) )
1052 {
1053 // An entry which has its size before contents, unless it's the last one
1054
1055 if ( !entrySizeKnown )
1056 {
1057 if ( size < sizeof( uint32_t ) )
1058 {
1059 gdWarning( "Stardict: malformed entry for the word %s encountered in \"%s\".\n", headword.c_str(), getName().c_str() );
1060 break;
1061 }
1062
1063 memcpy( &entrySize, ptr, sizeof( uint32_t ) );
1064
1065 entrySize = ntohl( entrySize );
1066
1067 ptr += sizeof( uint32_t );
1068 size -= sizeof( uint32_t );
1069 }
1070
1071 if ( size < entrySize )
1072 {
1073 gdWarning( "Stardict: malformed entry for the word %s encountered in \"%s\".\n", headword.c_str(), getName().c_str() );
1074 break;
1075 }
1076
1077 articleText += handleResource( type, ptr, entrySize );
1078
1079 ptr += entrySize;
1080 size -= entrySize;
1081 }
1082 else
1083 {
1084 gdWarning( "Stardict: non-alpha entry type 0x%x for the word %s encountered in \"%s\".\n",
1085 type, headword.c_str(), getName().c_str() );
1086 break;
1087 }
1088 }
1089 }
1090 else
1091 {
1092 // The sequence is stored in each article separately
1093 while( size )
1094 {
1095 if ( islower( *ptr ) )
1096 {
1097 // Zero-terminated entry
1098 size_t len = strlen( ptr + 1 );
1099
1100 if ( size < len + 2 )
1101 {
1102 gdWarning( "Stardict: malformed entry for the word %s encountered in \"%s\".\n", headword.c_str(), getName().c_str() );
1103 break;
1104 }
1105
1106 articleText += handleResource( *ptr, ptr + 1, len );
1107
1108 ptr += len + 2;
1109 size -= len + 2;
1110 }
1111 else
1112 if ( isupper( *ptr ) )
1113 {
1114 // An entry which havs its size before contents
1115 if ( size < sizeof( uint32_t ) + 1 )
1116 {
1117 gdWarning( "Stardict: malformed entry for the word %s encountered in \"%s\".\n", headword.c_str(), getName().c_str() );
1118 break;
1119 }
1120
1121 uint32_t entrySize;
1122
1123 memcpy( &entrySize, ptr + 1, sizeof( uint32_t ) );
1124
1125 entrySize = ntohl( entrySize );
1126
1127 if ( size < sizeof( uint32_t ) + 1 + entrySize )
1128 {
1129 gdWarning( "Stardict: malformed entry for the word %s encountered in \"%s\".\n", headword.c_str(), getName().c_str() );
1130 break;
1131 }
1132
1133 articleText += handleResource( *ptr, ptr + 1 + sizeof( uint32_t ), entrySize );
1134
1135 ptr += sizeof( uint32_t ) + 1 + entrySize;
1136 size -= sizeof( uint32_t ) + 1 + entrySize;
1137 }
1138 else
1139 {
1140 gdWarning( "Stardict: non-alpha entry type 0x%x for the word %s encountered in \"%s\".\n",
1141 (unsigned)*ptr, headword.c_str(), getName().c_str() );
1142 break;
1143 }
1144 }
1145 }
1146
1147 free( articleBody );
1148 }
1149
1150 QString const& StardictDictionary::getDescription()
1151 {
1152 if( !dictionaryDescription.isEmpty() )
1153 return dictionaryDescription;
1154
1155 File::Class ifoFile( getDictionaryFilenames()[ 0 ], "r" );
1156 Ifo ifo( ifoFile );
1157
1158 if( !ifo.copyright.empty() )
1159 {
1160 QString copyright = QString::fromUtf8( ifo.copyright.c_str() )
1161 .replace( "<br>", "\n", Qt::CaseInsensitive );
1162 dictionaryDescription += QString( QObject::tr( "Copyright: %1%2" ) )
1163 .arg( copyright )
1164 .arg( "\n\n" );
1165 }
1166
1167 if( !ifo.author.empty() )
1168 {
1169 QString author = QString::fromUtf8( ifo.author.c_str() );
1170 dictionaryDescription += QString( QObject::tr( "Author: %1%2" ) )
1171 .arg( author )
1172 .arg( "\n\n" );
1173 }
1174
1175 if( !ifo.email.empty() )
1176 {
1177 QString email = QString::fromUtf8( ifo.email.c_str() );
1178 dictionaryDescription += QString( QObject::tr( "E-mail: %1%2" ) )
1179 .arg( email )
1180 .arg( "\n\n" );
1181 }
1182
1183 if( !ifo.website.empty() )
1184 {
1185 QString website = QString::fromUtf8( ifo.website.c_str() );
1186 dictionaryDescription += QString( QObject::tr( "Website: %1%2" ) )
1187 .arg( website )
1188 .arg( "\n\n" );
1189 }
1190
1191 if( !ifo.date.empty() )
1192 {
1193 QString date = QString::fromUtf8( ifo.date.c_str() );
1194 dictionaryDescription += QString( QObject::tr( "Date: %1%2" ) )
1195 .arg( date )
1196 .arg( "\n\n" );
1197 }
1198
1199 if( !ifo.description.empty() )
1200 {
1201 QString desc = QString::fromUtf8( ifo.description.c_str() );
1202 desc.replace( "\t", "<br/>" );
1203 desc.replace( "\\n", "<br/>" );
1204 desc.replace( "<br>", "<br/>", Qt::CaseInsensitive );
1205 dictionaryDescription += Html::unescape( desc, true );
1206 }
1207
1208 if( dictionaryDescription.isEmpty() )
1209 dictionaryDescription = "NONE";
1210
1211 return dictionaryDescription;
1212 }
1213
1214 QString StardictDictionary::getMainFilename()
1215 {
1216 return FsEncoding::decode( getDictionaryFilenames()[ 0 ].c_str() );
1217 }
1218
1219 void StardictDictionary::makeFTSIndex( QAtomicInt & isCancelled, bool firstIteration )
1220 {
1221 if( !( Dictionary::needToRebuildIndex( getDictionaryFilenames(), ftsIdxName )
1222 || FtsHelpers::ftsIndexIsOldOrBad( ftsIdxName, this ) ) )
1223 FTS_index_completed.ref();
1224
1225 if( haveFTSIndex() )
1226 return;
1227
1228 if( ensureInitDone().size() )
1229 return;
1230
1231 if( firstIteration && getArticleCount() > FTS::MaxDictionarySizeForFastSearch )
1232 return;
1233
1234 gdDebug( "Stardict: Building the full-text index for dictionary: %s\n",
1235 getName().c_str() );
1236
1237 try
1238 {
1239 FtsHelpers::makeFTSIndex( this, isCancelled );
1240 FTS_index_completed.ref();
1241 }
1242 catch( std::exception &ex )
1243 {
1244 gdWarning( "Stardict: Failed building full-text search index for \"%s\", reason: %s\n", getName().c_str(), ex.what() );
1245 QFile::remove( FsEncoding::decode( ftsIdxName.c_str() ) );
1246 }
1247 }
1248
1249 void StardictDictionary::getArticleText( uint32_t articleAddress, QString & headword, QString & text )
1250 {
1251 try
1252 {
1253 string headwordStr, articleStr;
1254 loadArticle( articleAddress, headwordStr, articleStr );
1255
1256 headword = QString::fromUtf8( headwordStr.data(), headwordStr.size() );
1257
1258 wstring wstr = Utf8::decode( articleStr );
1259
1260 text = Html::unescape( gd::toQString( wstr ) );
1261 }
1262 catch( std::exception &ex )
1263 {
1264 gdWarning( "Stardict: Failed retrieving article from \"%s\", reason: %s\n", getName().c_str(), ex.what() );
1265 }
1266 }
1267
1268 sptr< Dictionary::DataRequest > StardictDictionary::getSearchResults( QString const & searchString,
1269 int searchMode, bool matchCase,
1270 int distanceBetweenWords,
1271 int maxResults,
1272 bool ignoreWordsOrder,
1273 bool ignoreDiacritics )
1274 {
1275 return new FtsHelpers::FTSResultsRequest( *this, searchString,searchMode, matchCase, distanceBetweenWords, maxResults, ignoreWordsOrder, ignoreDiacritics );
1276 }
1277
1278 /// StardictDictionary::findHeadwordsForSynonym()
1279
1280 class StardictHeadwordsRequest;
1281
1282 class StardictHeadwordsRequestRunnable: public QRunnable
1283 {
1284 StardictHeadwordsRequest & r;
1285 QSemaphore & hasExited;
1286
1287 public:
1288
1289 StardictHeadwordsRequestRunnable( StardictHeadwordsRequest & r_,
1290 QSemaphore & hasExited_ ): r( r_ ),
1291 hasExited( hasExited_ )
1292 {}
1293
1294 ~StardictHeadwordsRequestRunnable()
1295 {
1296 hasExited.release();
1297 }
1298
1299 virtual void run();
1300 };
1301
1302 class StardictHeadwordsRequest: public Dictionary::WordSearchRequest
1303 {
1304 friend class StardictHeadwordsRequestRunnable;
1305
1306 wstring word;
1307 StardictDictionary & dict;
1308
1309 QAtomicInt isCancelled;
1310 QSemaphore hasExited;
1311
1312 public:
1313
1314 StardictHeadwordsRequest( wstring const & word_,
1315 StardictDictionary & dict_ ):
1316 word( word_ ), dict( dict_ )
1317 {
1318 QThreadPool::globalInstance()->start(
1319 new StardictHeadwordsRequestRunnable( *this, hasExited ) );
1320 }
1321
1322 void run(); // Run from another thread by StardictHeadwordsRequestRunnable
1323
1324 virtual void cancel()
1325 {
1326 isCancelled.ref();
1327 }
1328
1329 ~StardictHeadwordsRequest()
1330 {
1331 isCancelled.ref();
1332 hasExited.acquire();
1333 }
1334 };
1335
1336 void StardictHeadwordsRequestRunnable::run()
1337 {
1338 r.run();
1339 }
1340
1341 void StardictHeadwordsRequest::run()
1342 {
1343 if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
1344 {
1345 finish();
1346 return;
1347 }
1348
1349 try
1350 {
1351 vector< WordArticleLink > chain = dict.findArticles( word );
1352
1353 wstring caseFolded = Folding::applySimpleCaseOnly( word );
1354
1355 for( unsigned x = 0; x < chain.size(); ++x )
1356 {
1357 if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
1358 {
1359 finish();
1360 return;
1361 }
1362
1363 string headword, articleText;
1364
1365 dict.loadArticle( chain[ x ].articleOffset,
1366 headword, articleText );
1367
1368 wstring headwordDecoded = Utf8::decode( headword );
1369
1370 if ( caseFolded != Folding::applySimpleCaseOnly( headwordDecoded ) )
1371 {
1372 // The headword seems to differ from the input word, which makes the
1373 // input word its synonym.
1374 Mutex::Lock _( dataMutex );
1375
1376 matches.push_back( headwordDecoded );
1377 }
1378 }
1379 }
1380 catch( std::exception & e )
1381 {
1382 setErrorString( QString::fromUtf8( e.what() ) );
1383 }
1384
1385 finish();
1386 }
1387
1388 sptr< Dictionary::WordSearchRequest >
1389 StardictDictionary::findHeadwordsForSynonym( wstring const & word )
1390 THROW_SPEC( std::exception )
1391 {
1392 return synonymSearchEnabled ? new StardictHeadwordsRequest( word, *this ) :
1393 Class::findHeadwordsForSynonym( word );
1394 }
1395
1396
1397 /// StardictDictionary::getArticle()
1398
1399 class StardictArticleRequest;
1400
1401 class StardictArticleRequestRunnable: public QRunnable
1402 {
1403 StardictArticleRequest & r;
1404 QSemaphore & hasExited;
1405
1406 public:
1407
1408 StardictArticleRequestRunnable( StardictArticleRequest & r_,
1409 QSemaphore & hasExited_ ): r( r_ ),
1410 hasExited( hasExited_ )
1411 {}
1412
1413 ~StardictArticleRequestRunnable()
1414 {
1415 hasExited.release();
1416 }
1417
1418 virtual void run();
1419 };
1420
1421 class StardictArticleRequest: public Dictionary::DataRequest
1422 {
1423 friend class StardictArticleRequestRunnable;
1424
1425 wstring word;
1426 vector< wstring > alts;
1427 StardictDictionary & dict;
1428 bool ignoreDiacritics;
1429
1430 QAtomicInt isCancelled;
1431 QSemaphore hasExited;
1432
1433 public:
1434
1435 StardictArticleRequest( wstring const & word_,
1436 vector< wstring > const & alts_,
1437 StardictDictionary & dict_,
1438 bool ignoreDiacritics_ ):
1439 word( word_ ), alts( alts_ ), dict( dict_ ), ignoreDiacritics( ignoreDiacritics_ )
1440 {
1441 QThreadPool::globalInstance()->start(
1442 new StardictArticleRequestRunnable( *this, hasExited ) );
1443 }
1444
1445 void run(); // Run from another thread by StardictArticleRequestRunnable
1446
1447 virtual void cancel()
1448 {
1449 isCancelled.ref();
1450 }
1451
1452 ~StardictArticleRequest()
1453 {
1454 isCancelled.ref();
1455 hasExited.acquire();
1456 }
1457 };
1458
1459 void StardictArticleRequestRunnable::run()
1460 {
1461 r.run();
1462 }
1463
1464 void StardictArticleRequest::run()
1465 {
1466 if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
1467 {
1468 finish();
1469 return;
1470 }
1471
1472 try
1473 {
1474 vector< WordArticleLink > chain = dict.findArticles( word, ignoreDiacritics );
1475
1476 for( unsigned x = 0; x < alts.size(); ++x )
1477 {
1478 /// Make an additional query for each alt
1479
1480 vector< WordArticleLink > altChain = dict.findArticles( alts[ x ], ignoreDiacritics );
1481
1482 chain.insert( chain.end(), altChain.begin(), altChain.end() );
1483 }
1484
1485 multimap< wstring, pair< string, string > > mainArticles, alternateArticles;
1486
1487 set< uint32_t > articlesIncluded; // Some synonims make it that the articles
1488 // appear several times. We combat this
1489 // by only allowing them to appear once.
1490
1491 wstring wordCaseFolded = Folding::applySimpleCaseOnly( word );
1492 if( ignoreDiacritics )
1493 wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
1494
1495 for( unsigned x = 0; x < chain.size(); ++x )
1496 {
1497 if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
1498 {
1499 finish();
1500 return;
1501 }
1502
1503 if ( articlesIncluded.find( chain[ x ].articleOffset ) != articlesIncluded.end() )
1504 continue; // We already have this article in the body.
1505
1506 // Now grab that article
1507
1508 string headword, articleText;
1509
1510 dict.loadArticle( chain[ x ].articleOffset, headword, articleText );
1511
1512 // Ok. Now, does it go to main articles, or to alternate ones? We list
1513 // main ones first, and alternates after.
1514
1515 // We do the case-folded comparison here.
1516
1517 wstring headwordStripped =
1518 Folding::applySimpleCaseOnly( Utf8::decode( headword ) );
1519 if( ignoreDiacritics )
1520 headwordStripped = Folding::applyDiacriticsOnly( headwordStripped );
1521
1522 multimap< wstring, pair< string, string > > & mapToUse =
1523 ( wordCaseFolded == headwordStripped ) ?
1524 mainArticles : alternateArticles;
1525
1526 mapToUse.insert( pair< wstring, pair< string, string > >(
1527 Folding::applySimpleCaseOnly( Utf8::decode( headword ) ),
1528 pair< string, string >( headword, articleText ) ) );
1529
1530 articlesIncluded.insert( chain[ x ].articleOffset );
1531 }
1532
1533 if ( mainArticles.empty() && alternateArticles.empty() )
1534 {
1535 // No such word
1536 finish();
1537 return;
1538 }
1539
1540 string result;
1541
1542 multimap< wstring, pair< string, string > >::const_iterator i;
1543
1544 string cleaner = "</font>""</font>""</font>""</font>""</font>""</font>"
1545 "</font>""</font>""</font>""</font>""</font>""</font>"
1546 "</b></b></b></b></b></b></b></b>"
1547 "</i></i></i></i></i></i></i></i>";
1548
1549 for( i = mainArticles.begin(); i != mainArticles.end(); ++i )
1550 {
1551 result += dict.isFromLanguageRTL() ? "<h3 class=\"sdct_headwords\" dir=\"rtl\">" : "<h3 class=\"sdct_headwords\">";
1552 result += i->second.first;
1553 result += "</h3>";
1554 if( dict.isToLanguageRTL() )
1555 result += "<div style=\"display:inline;\" dir=\"rtl\">";
1556 result += i->second.second;
1557 result += cleaner;
1558 if( dict.isToLanguageRTL() )
1559 result += "</div>";
1560 }
1561
1562 for( i = alternateArticles.begin(); i != alternateArticles.end(); ++i )
1563 {
1564 result += dict.isFromLanguageRTL() ? "<h3 class=\"sdct_headwords\" dir=\"rtl\">" : "<h3 class=\"sdct_headwords\">";
1565 result += i->second.first;
1566 result += "</h3>";
1567 if( dict.isToLanguageRTL() )
1568 result += "<div style=\"display:inline;\" dir=\"rtl\">";
1569 result += i->second.second;
1570 result += cleaner;
1571 if( dict.isToLanguageRTL() )
1572 result += "</div>";
1573 }
1574
1575 Mutex::Lock _( dataMutex );
1576
1577 data.resize( result.size() );
1578
1579 memcpy( &data.front(), result.data(), result.size() );
1580
1581 hasAnyData = true;
1582 }
1583 catch( std::exception & e )
1584 {
1585 setErrorString( QString::fromUtf8( e.what() ) );
1586 }
1587
1588 finish();
1589 }
1590
1591 sptr< Dictionary::DataRequest > StardictDictionary::getArticle( wstring const & word,
1592 vector< wstring > const & alts,
1593 wstring const &,
1594 bool ignoreDiacritics )
1595 THROW_SPEC( std::exception )
1596 {
1597 return new StardictArticleRequest( word, alts, *this, ignoreDiacritics );
1598 }
1599
1600
/// Checks whether str starts with substr.
///
/// @return a pointer to the first character of str after the matched prefix,
///         or 0 if str does not start with substr
static char const * beginsWith( char const * substr, char const * str )
{
  // Walk both strings in step until the prefix is exhausted; a premature end
  // of str shows up as a mismatch against a non-zero prefix character.
  for( ; *substr; ++substr, ++str )
  {
    if ( *str != *substr )
      return 0;
  }

  return str;
}
1607
1608 Ifo::Ifo( File::Class & f ):
1609 wordcount( 0 ), synwordcount( 0 ), idxfilesize( 0 ), idxoffsetbits( 32 )
1610 {
1611 static string const versionEq( "version=" );
1612
1613 static string const booknameEq( "bookname=" );
1614
1615 //DPRINTF( "%s<\n", f.gets().c_str() );
1616 //DPRINTF( "%s<\n", f.gets().c_str() );
1617
1618 if ( QString::fromUtf8(f.gets().c_str()) != "StarDict's dict ifo file" ||
1619 f.gets().compare( 0, versionEq.size(), versionEq ) )
1620 throw exNotAnIfoFile();
1621
1622 /// Now go through the file and parse options
1623
1624 try
1625 {
1626 char option[ 16384 ];
1627
1628 for( ; ; )
1629 {
1630 if ( !f.gets( option, sizeof( option ), true ) )
1631 break;
1632
1633 if ( char const * val = beginsWith( "bookname=", option ) )
1634 bookname = val;
1635 else
1636 if ( char const * val = beginsWith( "wordcount=", option ) )
1637 {
1638 if ( sscanf( val, "%u", & wordcount ) != 1 )
1639 throw exBadFieldInIfo( option );
1640 }
1641 else
1642 if ( char const * val = beginsWith( "synwordcount=", option ) )
1643 {
1644 if ( sscanf( val, "%u", & synwordcount ) != 1 )
1645 throw exBadFieldInIfo( option );
1646 }
1647 else
1648 if ( char const * val = beginsWith( "idxfilesize=", option ) )
1649 {
1650 if ( sscanf( val, "%u", & idxfilesize ) != 1 )
1651 throw exBadFieldInIfo( option );
1652 }
1653 else
1654 if ( char const * val = beginsWith( "idxoffsetbits=", option ) )
1655 {
1656 if ( sscanf( val, "%u", & idxoffsetbits ) != 1 || ( idxoffsetbits != 32
1657 && idxoffsetbits != 64 ) )
1658 throw exBadFieldInIfo( option );
1659 }
1660 else
1661 if ( char const * val = beginsWith( "sametypesequence=", option ) )
1662 sametypesequence = val;
1663 else
1664 if ( char const * val = beginsWith( "dicttype=", option ) )
1665 dicttype = val;
1666 else
1667 if ( char const * val = beginsWith( "description=", option ) )
1668 description = val;
1669 else
1670 if ( char const * val = beginsWith( "copyright=", option ) )
1671 copyright = val;
1672 else
1673 if ( char const * val = beginsWith( "author=", option ) )
1674 author = val;
1675 else
1676 if ( char const * val = beginsWith( "email=", option ) )
1677 email = val;
1678 else
1679 if ( char const * val = beginsWith( "website=", option ) )
1680 website = val;
1681 else
1682 if ( char const * val = beginsWith( "date=", option ) )
1683 date = val;
1684 }
1685 }
1686 catch( File::exReadError & )
1687 {
1688 }
1689 }
1690
1691 //// StardictDictionary::getResource()
1692
1693 class StardictResourceRequest;
1694
1695 class StardictResourceRequestRunnable: public QRunnable
1696 {
1697 StardictResourceRequest & r;
1698 QSemaphore & hasExited;
1699
1700 public:
1701
1702 StardictResourceRequestRunnable( StardictResourceRequest & r_,
1703 QSemaphore & hasExited_ ): r( r_ ),
1704 hasExited( hasExited_ )
1705 {}
1706
1707 ~StardictResourceRequestRunnable()
1708 {
1709 hasExited.release();
1710 }
1711
1712 virtual void run();
1713 };
1714
1715 class StardictResourceRequest: public Dictionary::DataRequest
1716 {
1717 friend class StardictResourceRequestRunnable;
1718
1719 StardictDictionary & dict;
1720
1721 string resourceName;
1722
1723 QAtomicInt isCancelled;
1724 QSemaphore hasExited;
1725
1726 public:
1727
1728 StardictResourceRequest( StardictDictionary & dict_,
1729 string const & resourceName_ ):
1730 dict( dict_ ),
1731 resourceName( resourceName_ )
1732 {
1733 QThreadPool::globalInstance()->start(
1734 new StardictResourceRequestRunnable( *this, hasExited ) );
1735 }
1736
1737 void run(); // Run from another thread by StardictResourceRequestRunnable
1738
1739 virtual void cancel()
1740 {
1741 isCancelled.ref();
1742 }
1743
1744 ~StardictResourceRequest()
1745 {
1746 isCancelled.ref();
1747 hasExited.acquire();
1748 }
1749 };
1750
1751 void StardictResourceRequestRunnable::run()
1752 {
1753 r.run();
1754 }
1755
1756 void StardictResourceRequest::run()
1757 {
1758 // Some runnables linger enough that they are cancelled before they start
1759 if ( Qt4x5::AtomicInt::loadAcquire( isCancelled ) )
1760 {
1761 finish();
1762 return;
1763 }
1764
1765 try
1766 {
1767 if( resourceName.at( 0 ) == '\x1E' )
1768 resourceName = resourceName.erase( 0, 1 );
1769 if( resourceName.at( resourceName.length() - 1 ) == '\x1F' )
1770 resourceName.erase( resourceName.length() - 1, 1 );
1771
1772 string n =
1773 FsEncoding::dirname( dict.getDictionaryFilenames()[ 0 ] ) +
1774 FsEncoding::separator() +
1775 "res" +
1776 FsEncoding::separator() +
1777 FsEncoding::encode( resourceName );
1778
1779 GD_DPRINTF( "n is %s\n", n.c_str() );
1780
1781 try
1782 {
1783 Mutex::Lock _( dataMutex );
1784
1785 File::loadFromFile( n, data );
1786 }
1787 catch( File::exCantOpen & )
1788 {
1789 // Try reading from zip file
1790
1791 if ( dict.resourceZip.isOpen() )
1792 {
1793 Mutex::Lock _( dict.resourceZipMutex );
1794
1795 Mutex::Lock __( dataMutex );
1796
1797 if ( !dict.resourceZip.loadFile( Utf8::decode( resourceName ), data ) )
1798 throw; // Make it fail since we couldn't read the archive
1799 }
1800 else
1801 throw;
1802 }
1803
1804 if ( Filetype::isNameOfTiff( resourceName ) )
1805 {
1806 // Convert it
1807
1808 dataMutex.lock();
1809
1810 QImage img = QImage::fromData( (unsigned char *) &data.front(),
1811 data.size() );
1812
1813 #ifdef MAKE_EXTRA_TIFF_HANDLER
1814 if( img.isNull() )
1815 GdTiff::tiffToQImage( &data.front(), data.size(), img );
1816 #endif
1817
1818 dataMutex.unlock();
1819
1820 if ( !img.isNull() )
1821 {
1822 // Managed to load -- now store it back as BMP
1823
1824 QByteArray ba;
1825 QBuffer buffer( &ba );
1826 buffer.open( QIODevice::WriteOnly );
1827 img.save( &buffer, "BMP" );
1828
1829 Mutex::Lock _( dataMutex );
1830
1831 data.resize( buffer.size() );
1832
1833 memcpy( &data.front(), buffer.data(), data.size() );
1834 }
1835 }
1836
1837 if( Filetype::isNameOfCSS( resourceName ) )
1838 {
1839 Mutex::Lock _( dataMutex );
1840
1841 QString css = QString::fromUtf8( data.data(), data.size() );
1842
1843 // Correct some url's
1844
1845 QString id = QString::fromUtf8( dict.getId().c_str() );
1846 int pos = 0;
1847
1848 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
1849 QRegularExpression links( "url\\(\\s*(['\"]?)([^'\"]*)(['\"]?)\\s*\\)",
1850 QRegularExpression::CaseInsensitiveOption );
1851
1852 QString newCSS;
1853 QRegularExpressionMatchIterator it = links.globalMatch( css );
1854 while( it.hasNext() )
1855 {
1856 QRegularExpressionMatch match = it.next();
1857 newCSS += css.midRef( pos, match.capturedStart() - pos );
1858 pos = match.capturedEnd();
1859
1860 QString url = match.captured( 2 );
1861
1862 if( url.indexOf( ":/" ) >= 0 || url.indexOf( "data:" ) >= 0)
1863 {
1864 // External link
1865 newCSS += match.captured();
1866 continue;
1867 }
1868
1869 QString newUrl = QString( "url(" ) + match.captured( 1 ) + "bres://"
1870 + id + "/" + url + match.captured( 3 ) + ")";
1871 newCSS += newUrl;
1872 }
1873 if( pos )
1874 {
1875 newCSS += css.midRef( pos );
1876 css = newCSS;
1877 newCSS.clear();
1878 }
1879 #else
1880 QRegExp links( "url\\(\\s*(['\"]?)([^'\"]*)(['\"]?)\\s*\\)", Qt::CaseInsensitive, QRegExp::RegExp );
1881 for( ; ; )
1882 {
1883 pos = links.indexIn( css, pos );
1884 if( pos < 0 )
1885 break;
1886 QString url = links.cap( 2 );
1887
1888 if( url.indexOf( ":/" ) >= 0 || url.indexOf( "data:" ) >= 0)
1889 {
1890 // External link
1891 pos += links.cap().size();
1892 continue;
1893 }
1894
1895 QString newUrl = QString( "url(" ) + links.cap( 1 ) + "bres://"
1896 + id + "/" + url + links.cap( 3 ) + ")";
1897 css.replace( pos, links.cap().size(), newUrl );
1898 pos += newUrl.size();
1899 }
1900 #endif
1901
1902 dict.isolateCSS( css );
1903 QByteArray bytes = css.toUtf8();
1904 data.resize( bytes.size() );
1905 memcpy( &data.front(), bytes.constData(), bytes.size() );
1906 }
1907
1908 Mutex::Lock _( dataMutex );
1909 hasAnyData = true;
1910 }
1911 catch( std::exception &ex )
1912 {
1913 gdWarning( "Stardict: Failed loading resource \"%s\" for \"%s\", reason: %s\n",
1914 resourceName.c_str(), dict.getName().c_str(), ex.what() );
1915 // Resource not loaded -- we don't set the hasAnyData flag then
1916 }
1917 catch( ... )
1918 {
1919 }
1920
1921 finish();
1922 }
1923
1924 sptr< Dictionary::DataRequest > StardictDictionary::getResource( string const & name )
1925 THROW_SPEC( std::exception )
1926 {
1927 return new StardictResourceRequest( *this, name );
1928 }
1929
1930 } // anonymous namespace
1931
findCorrespondingFiles(string const & ifo,string & idx,string & dict,string & syn)1932 static void findCorrespondingFiles( string const & ifo,
1933 string & idx, string & dict, string & syn )
1934 {
1935 string base( ifo, 0, ifo.size() - 3 );
1936
1937 if ( !(
1938 File::tryPossibleName( base + "idx", idx ) ||
1939 File::tryPossibleName( base + "idx.gz", idx ) ||
1940 File::tryPossibleName( base + "idx.dz", idx ) ||
1941 File::tryPossibleName( base + "IDX", idx ) ||
1942 File::tryPossibleName( base + "IDX.GZ", idx ) ||
1943 File::tryPossibleName( base + "IDX.DZ", idx )
1944 ) )
1945 throw exNoIdxFile( ifo );
1946
1947 if ( !(
1948 File::tryPossibleName( base + "dict", dict ) ||
1949 File::tryPossibleName( base + "dict.dz", dict ) ||
1950 File::tryPossibleName( base + "DICT", dict ) ||
1951 File::tryPossibleName( base + "dict.DZ", dict )
1952 ) )
1953 throw exNoDictFile( ifo );
1954
1955 if ( !(
1956 File::tryPossibleName( base + "syn", syn ) ||
1957 File::tryPossibleName( base + "syn.gz", syn ) ||
1958 File::tryPossibleName( base + "syn.dz", syn ) ||
1959 File::tryPossibleName( base + "SYN", syn ) ||
1960 File::tryPossibleName( base + "SYN.GZ", syn ) ||
1961 File::tryPossibleName( base + "SYN.DZ", syn )
1962 ) )
1963 syn.clear();
1964 }
1965
handleIdxSynFile(string const & fileName,IndexedWords & indexedWords,ChunkedStorage::Writer & chunks,vector<uint32_t> * articleOffsets,bool isSynFile,bool parseHeadwords)1966 static void handleIdxSynFile( string const & fileName,
1967 IndexedWords & indexedWords,
1968 ChunkedStorage::Writer & chunks,
1969 vector< uint32_t > * articleOffsets,
1970 bool isSynFile, bool parseHeadwords )
1971 {
1972 gzFile stardictIdx = gd_gzopen( fileName.c_str() );
1973 if ( !stardictIdx )
1974 throw exCantReadFile( fileName );
1975
1976 vector< char > image;
1977
1978 for( ; ; )
1979 {
1980 size_t oldSize = image.size();
1981
1982 image.resize( oldSize + 65536 );
1983
1984 int rd = gzread( stardictIdx, &image.front() + oldSize, 65536 );
1985
1986 if ( rd < 0 )
1987 {
1988 gzclose( stardictIdx );
1989 throw exCantReadFile( fileName );
1990 }
1991
1992 if ( rd != 65536 )
1993 {
1994 image.resize( oldSize + rd + 1 );
1995 break;
1996 }
1997 }
1998 gzclose( stardictIdx );
1999
2000 // We append one zero byte to catch runaway string at the end, if any
2001
2002 image.back() = 0;
2003
2004 // Now parse it
2005
2006 for( char const * ptr = &image.front(); ptr != &image.back(); )
2007 {
2008 size_t wordLen = strlen( ptr );
2009
2010 if ( ptr + wordLen + 1 + ( isSynFile ? sizeof( uint32_t ) :
2011 sizeof( uint32_t ) * 2 ) >
2012 &image.back() )
2013 {
2014 GD_FDPRINTF( stderr, "Warning: sudden end of file %s\n", fileName.c_str() );
2015 break;
2016 }
2017
2018 char const * word = ptr;
2019
2020 ptr += wordLen + 1;
2021
2022 uint32_t offset;
2023
2024 if( strstr( word, "&#" ) )
2025 {
2026 // Decode some html-coded symbols in headword
2027 string unescapedWord = Html::unescapeUtf8( word );
2028 strncpy( (char *)word, unescapedWord.c_str(), wordLen );
2029 wordLen = strlen( word );
2030 }
2031
2032 if ( !isSynFile )
2033 {
2034 // We're processing the .idx file
2035 uint32_t articleOffset, articleSize;
2036
2037 memcpy( &articleOffset, ptr, sizeof( uint32_t ) );
2038 ptr += sizeof( uint32_t );
2039 memcpy( &articleSize, ptr, sizeof( uint32_t ) );
2040 ptr += sizeof( uint32_t );
2041
2042 articleOffset = ntohl( articleOffset );
2043 articleSize = ntohl( articleSize );
2044
2045 // Create an entry for the article in the chunked storage
2046
2047 offset = chunks.startNewBlock();
2048
2049 if ( articleOffsets )
2050 articleOffsets->push_back( offset );
2051
2052 chunks.addToBlock( &articleOffset, sizeof( uint32_t ) );
2053 chunks.addToBlock( &articleSize, sizeof( uint32_t ) );
2054 chunks.addToBlock( word, wordLen + 1 );
2055 }
2056 else
2057 {
2058 // We're processing the .syn file
2059 uint32_t offsetInIndex;
2060
2061 memcpy( &offsetInIndex, ptr, sizeof( uint32_t ) );
2062 ptr += sizeof( uint32_t );
2063
2064 offsetInIndex = ntohl( offsetInIndex );
2065
2066 if ( offsetInIndex >= articleOffsets->size() )
2067 throw exIncorrectOffset( fileName );
2068
2069 offset = (*articleOffsets)[ offsetInIndex ];
2070
2071 // Some StarDict dictionaries are in fact badly converted Babylon ones.
2072 // They contain a lot of superfluous slashed entries with dollar signs.
2073 // We try to filter them out here, since those entries become much more
2074 // apparent in GoldenDict than they were in StarDict because of
2075 // punctuation folding. Hopefully there are not a whole lot of valid
2076 // synonyms which really start from slash and contain dollar signs, or
2077 // end with dollar and contain slashes.
2078 if ( *word == '/' )
2079 {
2080 if ( strchr( word, '$' ) )
2081 continue; // Skip this entry
2082 }
2083 else
2084 if ( wordLen && word[ wordLen - 1 ] == '$' )
2085 {
2086 if ( strchr( word, '/' ) )
2087 continue; // Skip this entry
2088 }
2089 }
2090
2091 // Insert new entry into an index
2092
2093 if( parseHeadwords )
2094 indexedWords.addWord( Utf8::decode( word ), offset );
2095 else
2096 indexedWords.addSingleWord( Utf8::decode( word ), offset );
2097 }
2098
2099 GD_DPRINTF( "%u entires made\n", (unsigned) indexedWords.size() );
2100 }
2101
2102
makeDictionaries(vector<string> const & fileNames,string const & indicesDir,Dictionary::Initializing & initializing,unsigned maxHeadwordsToExpand)2103 vector< sptr< Dictionary::Class > > makeDictionaries(
2104 vector< string > const & fileNames,
2105 string const & indicesDir,
2106 Dictionary::Initializing & initializing,
2107 unsigned maxHeadwordsToExpand )
2108 THROW_SPEC( std::exception )
2109 {
2110 vector< sptr< Dictionary::Class > > dictionaries;
2111
2112 for( vector< string >::const_iterator i = fileNames.begin(); i != fileNames.end();
2113 ++i )
2114 {
2115 if ( i->size() < 4 ||
2116 strcasecmp( i->c_str() + ( i->size() - 4 ), ".ifo" ) != 0 )
2117 continue;
2118
2119 try
2120 {
2121 vector< string > dictFiles( 1, *i );
2122
2123 string idxFileName, dictFileName, synFileName;
2124
2125 findCorrespondingFiles( *i, idxFileName, dictFileName, synFileName );
2126
2127 dictFiles.push_back( idxFileName );
2128 dictFiles.push_back( dictFileName );
2129
2130 if ( synFileName.size() )
2131 dictFiles.push_back( synFileName );
2132
2133 // See if there's a zip file with resources present. If so, include it.
2134
2135 string zipFileName;
2136 string baseName = FsEncoding::dirname( idxFileName ) + FsEncoding::separator();
2137
2138 if ( File::tryPossibleZipName( baseName + "res.zip", zipFileName ) ||
2139 File::tryPossibleZipName( baseName + "RES.ZIP", zipFileName ) ||
2140 File::tryPossibleZipName( baseName + "res" + FsEncoding::separator() + "res.zip", zipFileName ) )
2141 dictFiles.push_back( zipFileName );
2142
2143 string dictId = Dictionary::makeDictionaryId( dictFiles );
2144
2145 string indexFile = indicesDir + dictId;
2146
2147 if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) ||
2148 indexIsOldOrBad( indexFile ) )
2149 {
2150 // Building the index
2151
2152 File::Class ifoFile( *i, "r" );
2153
2154 Ifo ifo( ifoFile );
2155
2156 gdDebug( "Stardict: Building the index for dictionary: %s\n", ifo.bookname.c_str() );
2157
2158 if ( ifo.idxoffsetbits == 64 )
2159 throw ex64BitsNotSupported();
2160
2161 if ( ifo.dicttype.size() )
2162 throw exDicttypeNotSupported();
2163
2164 if( synFileName.empty() )
2165 {
2166 if ( ifo.synwordcount )
2167 {
2168 GD_DPRINTF( "Warning: dictionary has synwordcount specified, but no "
2169 "corresponding .syn file was found\n" );
2170 ifo.synwordcount = 0; // Pretend it wasn't there
2171 }
2172 }
2173 else
2174 if ( !ifo.synwordcount )
2175 {
2176 GD_DPRINTF( "Warning: ignoring .syn file %s, since there's no synwordcount in .ifo specified\n",
2177 synFileName.c_str() );
2178 }
2179
2180
2181 GD_DPRINTF( "bookname = %s\n", ifo.bookname.c_str() );
2182 GD_DPRINTF( "wordcount = %u\n", ifo.wordcount );
2183
2184 initializing.indexingDictionary( ifo.bookname );
2185
2186 File::Class idx( indexFile, "wb" );
2187
2188 IdxHeader idxHeader;
2189
2190 memset( &idxHeader, 0, sizeof( idxHeader ) );
2191
2192 // We write a dummy header first. At the end of the process the header
2193 // will be rewritten with the right values.
2194
2195 idx.write( idxHeader );
2196
2197 idx.write( ifo.bookname.data(), ifo.bookname.size() );
2198 idx.write( ifo.sametypesequence.data(), ifo.sametypesequence.size() );
2199
2200 IndexedWords indexedWords;
2201
2202 ChunkedStorage::Writer chunks( idx );
2203
2204 // Load indices
2205 if ( !ifo.synwordcount )
2206 handleIdxSynFile( idxFileName, indexedWords, chunks, 0, false,
2207 !maxHeadwordsToExpand || ifo.wordcount < maxHeadwordsToExpand );
2208 else
2209 {
2210 vector< uint32_t > articleOffsets;
2211
2212 articleOffsets.reserve( ifo.wordcount );
2213
2214 handleIdxSynFile( idxFileName, indexedWords, chunks, &articleOffsets,
2215 false,
2216 !maxHeadwordsToExpand || ( ifo.wordcount + ifo.synwordcount ) < maxHeadwordsToExpand );
2217
2218 handleIdxSynFile( synFileName, indexedWords, chunks, &articleOffsets,
2219 true,
2220 !maxHeadwordsToExpand || ( ifo.wordcount + ifo.synwordcount ) < maxHeadwordsToExpand );
2221 }
2222
2223 // Finish with the chunks
2224
2225 idxHeader.chunksOffset = chunks.finish();
2226
2227 // Build index
2228
2229 IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );
2230
2231 idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
2232 idxHeader.indexRootOffset = idxInfo.rootOffset;
2233
2234 // That concludes it. Update the header.
2235
2236 idxHeader.signature = Signature;
2237 idxHeader.formatVersion = CurrentFormatVersion;
2238
2239 idxHeader.wordCount = ifo.wordcount;
2240 idxHeader.synWordCount = ifo.synwordcount;
2241 idxHeader.bookNameSize = ifo.bookname.size();
2242 idxHeader.sameTypeSequenceSize = ifo.sametypesequence.size();
2243
2244 // read languages
2245 QPair<quint32,quint32> langs =
2246 LangCoder::findIdsForFilename( QString::fromStdString( dictFileName ) );
2247
2248 // if no languages found, try dictionary's name
2249 if ( langs.first == 0 || langs.second == 0 )
2250 {
2251 langs =
2252 LangCoder::findIdsForFilename( QString::fromStdString( ifo.bookname ) );
2253 }
2254
2255 idxHeader.langFrom = langs.first;
2256 idxHeader.langTo = langs.second;
2257
2258 // If there was a zip file, index it too
2259
2260 if ( zipFileName.size() )
2261 {
2262 GD_DPRINTF( "Indexing zip file\n" );
2263
2264 idxHeader.hasZipFile = 1;
2265
2266 IndexedWords zipFileNames;
2267 IndexedZip zipFile;
2268 if( zipFile.openZipFile( QDir::fromNativeSeparators(
2269 FsEncoding::decode( zipFileName.c_str() ) ) ) )
2270 zipFile.indexFile( zipFileNames );
2271
2272 if( !zipFileNames.empty() )
2273 {
2274 // Build the resulting zip file index
2275
2276 IndexInfo idxInfo = BtreeIndexing::buildIndex( zipFileNames, idx );
2277
2278 idxHeader.zipIndexBtreeMaxElements = idxInfo.btreeMaxElements;
2279 idxHeader.zipIndexRootOffset = idxInfo.rootOffset;
2280 }
2281 else
2282 {
2283 // Bad zip file -- no index (though the mark that we have one
2284 // remains)
2285 idxHeader.zipIndexBtreeMaxElements = 0;
2286 idxHeader.zipIndexRootOffset = 0;
2287 }
2288 }
2289 else
2290 idxHeader.hasZipFile = 0;
2291
2292 // That concludes it. Update the header.
2293
2294 idx.rewind();
2295
2296 idx.write( &idxHeader, sizeof( idxHeader ) );
2297 }
2298
2299 dictionaries.push_back( new StardictDictionary( dictId,
2300 indexFile,
2301 dictFiles ) );
2302 }
2303 catch( std::exception & e )
2304 {
2305 gdWarning( "Stardict dictionary initializing failed: %s, error: %s\n",
2306 i->c_str(), e.what() );
2307 }
2308 }
2309
2310 return dictionaries;
2311 }
2312
2313
2314 }
2315