1 /* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
2  * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
3 
4 #include "lsa.hh"
5 #include "file.hh"
6 #include "iconv.hh"
7 #include "folding.hh"
8 #include "utf8.hh"
9 #include "btreeidx.hh"
10 #include "fsencoding.hh"
11 #include "audiolink.hh"
12 #include "gddebug.hh"
13 
#include <cerrno>
#include <set>
#include <string>
16 
17 #ifdef _MSC_VER
18 #include <stub_msvc.h>
19 #endif
20 
21 #define OV_EXCLUDE_STATIC_CALLBACKS
22 #include <vorbis/vorbisfile.h>
23 #include <QDir>
24 #include <QUrl>
25 #include <QDebug>
26 #include <QFile>
27 
28 #include "qt4x5.hh"
29 
30 namespace Lsa {
31 
32 using std::string;
33 using gd::wstring;
34 using std::map;
35 using std::multimap;
36 using std::set;
37 using BtreeIndexing::WordArticleLink;
38 using BtreeIndexing::IndexedWords;
39 using BtreeIndexing::IndexInfo;
40 
41 namespace {
42 
43 DEF_EX( exInvalidData, "Invalid data encountered", Dictionary::Ex )
44 DEF_EX( exFailedToOpenVorbisData, "Failed to open Vorbis data", Dictionary::Ex )
45 DEF_EX( exFailedToSeekInVorbisData, "Failed to seek in Vorbis data", Dictionary::Ex )
46 DEF_EX( exFailedToRetrieveVorbisInfo, "Failed to retrieve Vorbis info", Dictionary::Ex )
47 
48 enum
49 {
50   Signature = 0x5841534c, // LSAX on little-endian, XASL on big-endian
51   CurrentFormatVersion = 5
52 };
53 
/// The header stored at the very beginning of the index file.
struct IdxHeader
{
  /// Must be equal to Signature for the index to be accepted
  uint32_t signature;

  /// Must be equal to CurrentFormatVersion for the index to be accepted
  uint32_t formatVersion;

  /// Total number of sounds, for informative purposes only
  uint32_t soundsCount;

  /// Offset, within the dictionary file, of the Vorbis stream which contains
  /// all the sounds
  uint32_t vorbisOffset;

  /// The two fields below are taken from IndexInfo
  uint32_t indexBtreeMaxElements;
  uint32_t indexRootOffset;
}
#ifndef _MSC_VER
__attribute__((packed))
#endif
;
67 
indexIsOldOrBad(string const & indexFile)68 bool indexIsOldOrBad( string const & indexFile )
69 {
70   File::Class idx( indexFile, "rb" );
71 
72   IdxHeader header;
73 
74   return idx.readRecords( &header, sizeof( header ), 1 ) != 1 ||
75          header.signature != Signature ||
76          header.formatVersion != CurrentFormatVersion;
77 }
78 
/// Returns @p str without its trailing ".wav" (matched in any letter case).
/// A string which doesn't end that way is returned as is.
string stripExtension( string const & str )
{
  static char const wavSuffix[] = ".wav";
  size_t const suffixLength = sizeof( wavSuffix ) - 1;

  if ( str.size() < suffixLength )
    return str;

  size_t const stemLength = str.size() - suffixLength;

  if ( strcasecmp( str.c_str() + stemLength, wavSuffix ) != 0 )
    return str;

  return str.substr( 0, stemLength );
}
87 
88 struct Entry
89 {
90   string name;
91 
92   uint32_t samplesLength;
93   uint32_t samplesOffset;
94 public:
95 
96   // Reads an entry from the file's current position
97   Entry( File::Class & f );
98 };
99 
Entry(File::Class & f)100 Entry::Entry( File::Class & f )
101 {
102   bool firstEntry = ( f.tell() == 13 );
103   // Read the entry's filename
104   size_t read = 0;
105 
106   vector< uint16_t > filenameBuffer( 64 );
107 
108   for( ; ; ++read )
109   {
110     if ( filenameBuffer.size() <= read )
111       filenameBuffer.resize( read + 64 );
112 
113     f.read( &filenameBuffer[ read ], 2 );
114 
115     if ( filenameBuffer[ read ] == 0xD )
116     {
117       if ( f.read< uint16_t >() != 0xA )
118         throw exInvalidData();
119 
120       // Filename ending marker
121       break;
122     }
123   }
124 
125   // Skip zero or ff, or just ff.
126 
127   if ( uint8_t x = f.read< uint8_t >() )
128   {
129     if ( x != 0xFF )
130       throw exInvalidData();
131   }
132   else
133   if ( f.read< uint8_t >() != 0xFF )
134     throw exInvalidData();
135 
136 
137   if ( !firstEntry )
138   {
139     // For all entries but the first one, read its offset in
140     // samples.
141     samplesOffset  = f.read< uint32_t >();
142 
143     if ( f.read< uint8_t >() != 0xFF )
144       throw exInvalidData();
145   }
146   else
147     samplesOffset = 0;
148 
149   // Read the size of the recording, in samples
150   samplesLength = f.read< uint32_t >();
151 
152   name = Iconv::toUtf8( Iconv::Utf16Le, &filenameBuffer.front(),
153                         read * sizeof( uint16_t ) );
154 }
155 
156 class LsaDictionary: public BtreeIndexing::BtreeDictionary
157 {
158   Mutex idxMutex;
159   File::Class idx;
160   IdxHeader idxHeader;
161 
162 public:
163 
164   LsaDictionary( string const & id, string const & indexFile,
165                  vector< string > const & dictionaryFiles );
166 
167   virtual string getName() throw();
168 
getProperties()169   virtual map< Dictionary::Property, string > getProperties() throw()
170   { return map< Dictionary::Property, string >(); }
171 
getArticleCount()172   virtual unsigned long getArticleCount() throw()
173   { return idxHeader.soundsCount; }
174 
getWordCount()175   virtual unsigned long getWordCount() throw()
176   { return getArticleCount(); }
177 
178   virtual sptr< Dictionary::DataRequest > getArticle( wstring const &,
179                                                       vector< wstring > const & alts,
180                                                       wstring const &,
181                                                       bool ignoreDiacritics )
182     THROW_SPEC( std::exception );
183 
184   virtual sptr< Dictionary::DataRequest > getResource( string const & name )
185     THROW_SPEC( std::exception );
186 
187 protected:
188 
189   virtual void loadIcon() throw();
190 };
191 
getName()192 string LsaDictionary::getName() throw()
193 {
194   string result = FsEncoding::basename( getDictionaryFilenames()[ 0 ] );
195 
196   // Strip the extension
197   result.erase( result.rfind( '.' ) );
198 
199   return result;
200 }
201 
LsaDictionary(string const & id,string const & indexFile,vector<string> const & dictionaryFiles)202 LsaDictionary::LsaDictionary( string const & id,
203                               string const & indexFile,
204                               vector< string > const & dictionaryFiles ):
205   BtreeDictionary( id, dictionaryFiles ),
206   idx( indexFile, "rb" ),
207   idxHeader( idx.read< IdxHeader >() )
208 {
209   // Initialize the index
210 
211   openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
212                         idxHeader.indexRootOffset ),
213              idx, idxMutex );
214 }
215 
getArticle(wstring const & word,vector<wstring> const & alts,wstring const &,bool ignoreDiacritics)216 sptr< Dictionary::DataRequest > LsaDictionary::getArticle( wstring const & word,
217                                                            vector< wstring > const & alts,
218                                                            wstring const &,
219                                                            bool ignoreDiacritics )
220   THROW_SPEC( std::exception )
221 {
222   vector< WordArticleLink > chain = findArticles( word, ignoreDiacritics );
223 
224   for( unsigned x = 0; x < alts.size(); ++x )
225   {
226     /// Make an additional query for each alt
227 
228     vector< WordArticleLink > altChain = findArticles( alts[ x ], ignoreDiacritics );
229 
230     chain.insert( chain.end(), altChain.begin(), altChain.end() );
231   }
232 
233   multimap< wstring, string > mainArticles, alternateArticles;
234 
235   set< uint32_t > articlesIncluded; // Some synonims make it that the articles
236                                     // appear several times. We combat this
237                                     // by only allowing them to appear once.
238 
239   wstring wordCaseFolded = Folding::applySimpleCaseOnly( word );
240   if( ignoreDiacritics )
241     wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
242 
243   for( unsigned x = 0; x < chain.size(); ++x )
244   {
245     if ( articlesIncluded.find( chain[ x ].articleOffset ) != articlesIncluded.end() )
246       continue; // We already have this article in the body.
247 
248     // Ok. Now, does it go to main articles, or to alternate ones? We list
249     // main ones first, and alternates after.
250 
251     // We do the case-folded comparison here.
252 
253     wstring headwordStripped =
254       Folding::applySimpleCaseOnly( Utf8::decode( chain[ x ].word ) );
255     if( ignoreDiacritics )
256       headwordStripped = Folding::applyDiacriticsOnly( headwordStripped );
257 
258     multimap< wstring, string > & mapToUse =
259       ( wordCaseFolded == headwordStripped ) ?
260         mainArticles : alternateArticles;
261 
262     mapToUse.insert( std::pair< wstring, string >(
263       Folding::applySimpleCaseOnly( Utf8::decode( chain[ x ].word ) ), chain[ x ].word ) );
264 
265     articlesIncluded.insert( chain[ x ].articleOffset );
266   }
267 
268   if ( mainArticles.empty() && alternateArticles.empty() )
269     return new Dictionary::DataRequestInstant( false ); // No such word
270 
271   string result;
272 
273   multimap< wstring, string >::const_iterator i;
274 
275   result += "<table class=\"lsa_play\">";
276   for( i = mainArticles.begin(); i != mainArticles.end(); ++i )
277   {
278     result += "<tr>";
279 
280     QUrl url;
281     url.setScheme( "gdau" );
282     url.setHost( QString::fromUtf8( getId().c_str() ) );
283     url.setPath( Qt4x5::Url::ensureLeadingSlash( QString::fromUtf8( i->second.c_str() ) ) );
284 
285     string ref = string( "\"" ) + url.toEncoded().data() + "\"";
286 
287     result += addAudioLink( ref, getId() );
288 
289     result += "<td><a href=" + ref + "><img src=\"qrcx://localhost/icons/playsound.png\" border=\"0\" alt=\"Play\"/></a></td>";
290     result += "<td><a href=" + ref + ">" + i->second + "</a></td>";
291     result += "</tr>";
292   }
293 
294   for( i = alternateArticles.begin(); i != alternateArticles.end(); ++i )
295   {
296     result += "<tr>";
297 
298     QUrl url;
299     url.setScheme( "gdau" );
300     url.setHost( QString::fromUtf8( getId().c_str() ) );
301     url.setPath( Qt4x5::Url::ensureLeadingSlash( QString::fromUtf8( i->second.c_str() ) ) );
302 
303     string ref = string( "\"" ) + url.toEncoded().data() + "\"";
304 
305     result += addAudioLink( ref, getId() );
306 
307     result += "<td><a href=" + ref + "><img src=\"qrcx://localhost/icons/playsound.png\" border=\"0\" alt=\"Play\"/></a></td>";
308     result += "<td><a href=" + ref + ">" + i->second + "</a></td>";
309     result += "</tr>";
310   }
311 
312   result += "</table>";
313 
314   Dictionary::DataRequestInstant * ret =
315     new Dictionary::DataRequestInstant( true );
316 
317   ret->getData().resize( result.size() );
318 
319   memcpy( &(ret->getData().front()), result.data(), result.size() );
320 
321   return ret;
322 }
323 
324 /// This wraps around file operations
325 struct ShiftedVorbis
326 {
327   QFile & f;
328   size_t shift;
329 
ShiftedVorbisLsa::__anonda0c59650111::ShiftedVorbis330   ShiftedVorbis( QFile & f_, size_t shift_ ): f( f_ ), shift( shift_ )
331   {}
332 
333   static size_t read( void * ptr, size_t size, size_t nmemb, void * datasource );
334   static int seek( void * datasource, ogg_int64_t offset, int whence );
335   static long tell( void * datasource );
336 
337   static ov_callbacks callbacks;
338 };
339 
read(void * ptr,size_t size,size_t nmemb,void * datasource)340 size_t ShiftedVorbis::read( void * ptr, size_t size, size_t nmemb,
341                             void * datasource )
342 {
343   ShiftedVorbis * sv = ( ShiftedVorbis * ) datasource;
344 
345   return sv->f.read( reinterpret_cast<char *>( ptr ), size * nmemb );
346 }
347 
seek(void * datasource,ogg_int64_t offset,int whence)348 int ShiftedVorbis::seek( void * datasource, ogg_int64_t offset, int whence )
349 {
350   ShiftedVorbis * sv = ( ShiftedVorbis * ) datasource;
351 
352   if ( whence == SEEK_SET )
353     offset += sv->shift;
354 
355   if( whence == SEEK_CUR )
356     offset += sv->f.pos();
357 
358   if( whence == SEEK_END )
359     offset += sv->f.size();
360 
361   return sv->f.seek( offset );
362 }
363 
tell(void * datasource)364 long ShiftedVorbis::tell( void * datasource )
365 {
366   ShiftedVorbis * sv = ( ShiftedVorbis * ) datasource;
367   long result = sv->f.pos();
368 
369   if ( result != -1 )
370     result -= sv->shift;
371 
372   return result;
373 }
374 
375 ov_callbacks ShiftedVorbis::callbacks = { ShiftedVorbis::read,
376                                           ShiftedVorbis::seek,
377                                           NULL,
378                                           ShiftedVorbis::tell };
379 
/// A crude .wav header which is sufficient for our needs. The comments give
/// the values getResource() puts into the fields.
struct WavHeader
{
  /// "RIFF"
  char riff[ 4 ];
  /// Size of the whole data block, less 8 bytes
  uint32_t riffLength;
  /// "WAVEfmt " (with the trailing space)
  char waveAndFmt[ 8 ];
  /// 16
  uint32_t fmtLength;
  /// 1
  uint16_t formatTag;
  /// Number of channels of the Vorbis stream
  uint16_t channels;
  /// Sampling rate of the Vorbis stream
  uint32_t samplesPerSec;
  /// channels * samplesPerSec * 2
  uint32_t bytesPerSec;
  /// channels * 2
  uint16_t blockAlign;
  /// 16
  uint16_t bitsPerSample;
  /// "data"
  char data[ 4 ];
  /// Size of everything that follows the header
  uint32_t dataLength;
}
#ifndef _MSC_VER
__attribute__((packed))
#endif
;
400 
getResource(string const & name)401 sptr< Dictionary::DataRequest > LsaDictionary::getResource( string const & name )
402   THROW_SPEC( std::exception )
403 {
404   // See if the name ends in .wav. Remove that extension then
405 
406   string strippedName =
407     ( name.size() > 3 && ( name.compare( name.size() - 4, 4, ".wav" ) == 0 ) ) ?
408       string( name, 0, name.size() - 4 ) : name;
409 
410   vector< WordArticleLink > chain = findArticles( Utf8::decode( strippedName ) );
411 
412   if ( chain.empty() )
413     return new Dictionary::DataRequestInstant( false ); // No such resource
414 
415   File::Class f( getDictionaryFilenames()[ 0 ], "rb" );
416 
417   f.seek( chain[ 0 ].articleOffset );
418   Entry e( f );
419 
420   f.seek( idxHeader.vorbisOffset );
421 
422   ShiftedVorbis sv( f.file(), idxHeader.vorbisOffset );
423 
424   OggVorbis_File vf;
425 
426   int result = ov_open_callbacks( &sv, &vf, 0, 0, ShiftedVorbis::callbacks );
427 
428   if ( result )
429     throw exFailedToOpenVorbisData();
430 
431   if ( ov_pcm_seek( &vf, e.samplesOffset ) )
432     throw exFailedToSeekInVorbisData();
433 
434   vorbis_info * vi = ov_info( &vf, -1 );
435 
436   if ( !vi )
437   {
438     ov_clear( &vf );
439 
440     throw exFailedToRetrieveVorbisInfo();
441   }
442 
443   sptr< Dictionary::DataRequestInstant > dr = new
444     Dictionary::DataRequestInstant( true );
445 
446   vector< char > & data = dr->getData();
447 
448   data.resize( sizeof( WavHeader ) + e.samplesLength * 2 );
449 
450   WavHeader * wh = (WavHeader *)&data.front();
451 
452   memset( wh, 0, sizeof( *wh ) );
453 
454   memcpy( wh->riff, "RIFF", 4 );
455   wh->riffLength = data.size() - 8;
456 
457   memcpy( wh->waveAndFmt, "WAVEfmt ", 8 );
458   wh->fmtLength = 16;
459   wh->formatTag = 1;
460   wh->channels = vi->channels;
461   wh->samplesPerSec = vi->rate;
462   wh->bytesPerSec = vi->channels * vi->rate * 2;
463   wh->blockAlign = vi->channels * 2;
464   wh->bitsPerSample = 16;
465   memcpy( wh->data, "data", 4 );
466   wh->dataLength = data.size() - sizeof( *wh );
467 
468   // Now decode vorbis to the rest of the block
469 
470   char * ptr = &data.front() + sizeof( *wh );
471   int left = data.size() - sizeof( *wh );
472   int bitstream = 0;
473 
474   while( left )
475   {
476     long result = ov_read( &vf, ptr, left, 0, 2, 1, &bitstream );
477 
478     if ( result <= 0 )
479     {
480       gdWarning( "Failed to read Vorbis data (code = %ld)\n", result );
481       memset( ptr, 0, left );
482       break;
483     }
484 
485     if ( result > left )
486     {
487       GD_FDPRINTF( stderr, "Warning: Vorbis decode returned more data than requested.\n" );
488 
489       result = left;
490     }
491 
492     ptr += result;
493     left -= result;
494   }
495 
496   ov_clear( &vf );
497 
498   return dr;
499 }
500 
loadIcon()501 void LsaDictionary::loadIcon() throw()
502 {
503   if ( dictionaryIconLoaded )
504     return;
505 
506   QString fileName =
507     QDir::fromNativeSeparators( FsEncoding::decode( getDictionaryFilenames()[ 0 ].c_str() ) );
508 
509   // Remove the extension
510   fileName.chop( 3 );
511 
512   if( !loadIconFromFile( fileName ) )
513   {
514     // Load failed -- use default icons
515     dictionaryNativeIcon = dictionaryIcon = QIcon(":/icons/playsound.png");
516   }
517 
518   dictionaryIconLoaded = true;
519 }
520 
521 }
522 
makeDictionaries(vector<string> const & fileNames,string const & indicesDir,Dictionary::Initializing & initializing)523 vector< sptr< Dictionary::Class > > makeDictionaries(
524                                       vector< string > const & fileNames,
525                                       string const & indicesDir,
526                                       Dictionary::Initializing & initializing )
527   THROW_SPEC( std::exception )
528 {
529   vector< sptr< Dictionary::Class > > dictionaries;
530 
531   for( vector< string >::const_iterator i = fileNames.begin(); i != fileNames.end();
532        ++i )
533   {
534     /// Only allow .dat and .lsa extensions to save scanning time
535     if ( i->size() < 4 ||
536         ( strcasecmp( i->c_str() + ( i->size() - 4 ), ".dat" ) != 0 &&
537           strcasecmp( i->c_str() + ( i->size() - 4 ), ".lsa" ) != 0 ) )
538       continue;
539 
540     try
541     {
542       File::Class f( *i, "rb" );
543 
544       /// Check the signature
545 
546       char buf[ 9 ];
547 
548       if ( f.readRecords( buf, 9, 1 ) != 1 || memcmp( buf, "L\09\0S\0A\0\xff", 9 ) != 0 )
549       {
550         // The file is too short or the signature doesn't match -- skip this
551         // file
552         continue;
553       }
554 
555       vector< string > dictFiles( 1, *i );
556 
557       string dictId = Dictionary::makeDictionaryId( dictFiles );
558 
559       string indexFile = indicesDir + dictId;
560 
561       if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) || indexIsOldOrBad( indexFile ) )
562       {
563         // Building the index
564 
565         gdDebug( "Lsa: Building the index for dictionary: %s\n", i->c_str() );
566 
567         initializing.indexingDictionary( FsEncoding::basename( *i ) );
568 
569         File::Class idx( indexFile, "wb" );
570 
571         IdxHeader idxHeader;
572 
573         memset( &idxHeader, 0, sizeof( idxHeader ) );
574 
575         // We write a dummy header first. At the end of the process the header
576         // will be rewritten with the right values.
577 
578         idx.write( idxHeader );
579 
580         IndexedWords indexedWords;
581 
582         /// XXX handle big-endian machines here!
583         uint32_t entriesCount = f.read< uint32_t >();
584 
585         GD_DPRINTF( "%s: %u entries\n", i->c_str(), entriesCount );
586 
587         idxHeader.soundsCount = entriesCount;
588 
589         vector< uint16_t > filenameBuffer;
590 
591         while( entriesCount-- )
592         {
593           uint32_t offset = f.tell();
594 
595           Entry e( f );
596 
597 
598           // Remove the extension, no need for that in the index
599           e.name = stripExtension( e.name );
600 
601           GD_DPRINTF( "Read filename %s (%u at %u)<\n", e.name.c_str(), e.samplesLength, e.samplesOffset );
602 
603           // Insert new entry into an index
604 
605           indexedWords.addWord( Utf8::decode( e.name ), offset );
606         }
607 
608         idxHeader.vorbisOffset = f.tell();
609 
610         // Make sure there's really some vobis data there
611 
612         char buf[ 4 ];
613 
614         f.read( buf, sizeof( buf ) );
615 
616         if ( strncmp( buf, "OggS", 4 ) != 0 )
617           throw exInvalidData();
618 
619         // Build the index
620 
621         IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );
622 
623         idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
624         idxHeader.indexRootOffset = idxInfo.rootOffset;
625 
626          // That concludes it. Update the header.
627 
628         idxHeader.signature = Signature;
629         idxHeader.formatVersion = CurrentFormatVersion;
630 
631         idx.rewind();
632 
633         idx.write( &idxHeader, sizeof( idxHeader ) );
634       }
635 
636       dictionaries.push_back( new LsaDictionary( dictId,
637                                                  indexFile,
638                                                  dictFiles ) );
639     }
640     catch( std::exception & e )
641     {
642       gdWarning( "Lingvo's LSA reading failed: %s, error: %s\n",
643                  i->c_str(), e.what() );
644     }
645   }
646 
647   return dictionaries;
648 }
649 
650 
651 }
652