1 /* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
2 * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
3
4 #include "lsa.hh"
5 #include "file.hh"
6 #include "iconv.hh"
7 #include "folding.hh"
8 #include "utf8.hh"
9 #include "btreeidx.hh"
10 #include "fsencoding.hh"
11 #include "audiolink.hh"
12 #include "gddebug.hh"
13
#include <cerrno>
#include <set>
#include <string>
16
17 #ifdef _MSC_VER
18 #include <stub_msvc.h>
19 #endif
20
21 #define OV_EXCLUDE_STATIC_CALLBACKS
22 #include <vorbis/vorbisfile.h>
23 #include <QDir>
24 #include <QUrl>
25 #include <QDebug>
26 #include <QFile>
27
28 #include "qt4x5.hh"
29
30 namespace Lsa {
31
32 using std::string;
33 using gd::wstring;
34 using std::map;
35 using std::multimap;
36 using std::set;
37 using BtreeIndexing::WordArticleLink;
38 using BtreeIndexing::IndexedWords;
39 using BtreeIndexing::IndexInfo;
40
41 namespace {
42
// Exceptions raised in this file while parsing the LSA archive and while
// decoding the Vorbis stream stored in it. All of them derive from
// Dictionary::Ex.
DEF_EX( exInvalidData, "Invalid data encountered", Dictionary::Ex )
DEF_EX( exFailedToOpenVorbisData, "Failed to open Vorbis data", Dictionary::Ex )
DEF_EX( exFailedToSeekInVorbisData, "Failed to seek in Vorbis data", Dictionary::Ex )
DEF_EX( exFailedToRetrieveVorbisInfo, "Failed to retrieve Vorbis info", Dictionary::Ex )
47
// Constants identifying the index file written by makeDictionaries(). An index
// whose header does not carry both values is rejected by indexIsOldOrBad().
enum
{
  Signature = 0x5841534c, // LSAX on little-endian, XASL on big-endian
  CurrentFormatVersion = 5
};
53
// Header stored at the very beginning of the index file. It is written and
// read back as raw bytes, hence the packed layout.
struct IdxHeader
{
  uint32_t signature; // First comes the signature, LSAX (see Signature)
  uint32_t formatVersion; // File format version, see CurrentFormatVersion
  uint32_t soundsCount; // Total number of sounds, for informative purposes only
  uint32_t vorbisOffset; // Offset of the vorbis file which contains all snds
  uint32_t indexBtreeMaxElements; // Two fields from IndexInfo
  uint32_t indexRootOffset;
}
#ifndef _MSC_VER
__attribute__((packed))
#endif
;
67
indexIsOldOrBad(string const & indexFile)68 bool indexIsOldOrBad( string const & indexFile )
69 {
70 File::Class idx( indexFile, "rb" );
71
72 IdxHeader header;
73
74 return idx.readRecords( &header, sizeof( header ), 1 ) != 1 ||
75 header.signature != Signature ||
76 header.formatVersion != CurrentFormatVersion;
77 }
78
/// Returns str with a trailing ".wav" removed. The suffix is matched without
/// regard to case; strings that don't end in it are returned unchanged.
string stripExtension( string const & str )
{
  static char const wavSuffix[] = ".wav";
  size_t const suffixLength = sizeof( wavSuffix ) - 1;

  // Too short to carry the suffix at all
  if ( str.size() < suffixLength )
    return str;

  size_t const stemLength = str.size() - suffixLength;

  if ( strcasecmp( str.c_str() + stemLength, wavSuffix ) != 0 )
    return str;

  return str.substr( 0, stemLength );
}
87
/// One sound record of an LSA archive, as parsed from the archive's entry
/// table.
struct Entry
{
  string name; // Entry's file name, converted to UTF-8

  uint32_t samplesLength; // Size of the recording, in samples
  uint32_t samplesOffset; // Offset of the recording, in samples; 0 for the
                          // first entry
public:

  // Reads an entry from the file's current position
  Entry( File::Class & f );
};
99
Entry(File::Class & f)100 Entry::Entry( File::Class & f )
101 {
102 bool firstEntry = ( f.tell() == 13 );
103 // Read the entry's filename
104 size_t read = 0;
105
106 vector< uint16_t > filenameBuffer( 64 );
107
108 for( ; ; ++read )
109 {
110 if ( filenameBuffer.size() <= read )
111 filenameBuffer.resize( read + 64 );
112
113 f.read( &filenameBuffer[ read ], 2 );
114
115 if ( filenameBuffer[ read ] == 0xD )
116 {
117 if ( f.read< uint16_t >() != 0xA )
118 throw exInvalidData();
119
120 // Filename ending marker
121 break;
122 }
123 }
124
125 // Skip zero or ff, or just ff.
126
127 if ( uint8_t x = f.read< uint8_t >() )
128 {
129 if ( x != 0xFF )
130 throw exInvalidData();
131 }
132 else
133 if ( f.read< uint8_t >() != 0xFF )
134 throw exInvalidData();
135
136
137 if ( !firstEntry )
138 {
139 // For all entries but the first one, read its offset in
140 // samples.
141 samplesOffset = f.read< uint32_t >();
142
143 if ( f.read< uint8_t >() != 0xFF )
144 throw exInvalidData();
145 }
146 else
147 samplesOffset = 0;
148
149 // Read the size of the recording, in samples
150 samplesLength = f.read< uint32_t >();
151
152 name = Iconv::toUtf8( Iconv::Utf16Le, &filenameBuffer.front(),
153 read * sizeof( uint16_t ) );
154 }
155
/// A dictionary serving the sounds of one LSA archive. Headwords are looked up
/// through the btree index inherited from BtreeIndexing::BtreeDictionary;
/// getArticle() produces an HTML table of playable links and getResource()
/// returns the requested sound as .wav data.
class LsaDictionary: public BtreeIndexing::BtreeDictionary
{
  // Note: idxHeader is initialized by reading from idx, so idx must stay
  // declared before it.
  Mutex idxMutex; // Passed to openIndex() along with idx
  File::Class idx; // The index file, opened for reading
  IdxHeader idxHeader; // Header read from the beginning of idx

public:

  LsaDictionary( string const & id, string const & indexFile,
                 vector< string > const & dictionaryFiles );

  /// Base name of the first dictionary file, with the extension removed.
  virtual string getName() throw();

  /// This dictionary exposes no properties -- the map is always empty.
  virtual map< Dictionary::Property, string > getProperties() throw()
  { return map< Dictionary::Property, string >(); }

  /// Number of sounds, as recorded in the index header.
  virtual unsigned long getArticleCount() throw()
  { return idxHeader.soundsCount; }

  /// Same as getArticleCount().
  virtual unsigned long getWordCount() throw()
  { return getArticleCount(); }

  /// Builds an HTML table with a playable link for every sound matching the
  /// word or one of alts.
  virtual sptr< Dictionary::DataRequest > getArticle( wstring const &,
                                                      vector< wstring > const & alts,
                                                      wstring const &,
                                                      bool ignoreDiacritics )
    THROW_SPEC( std::exception );

  /// Decodes the sound called name and returns it as .wav data.
  virtual sptr< Dictionary::DataRequest > getResource( string const & name )
    THROW_SPEC( std::exception );

protected:

  virtual void loadIcon() throw();
};
191
getName()192 string LsaDictionary::getName() throw()
193 {
194 string result = FsEncoding::basename( getDictionaryFilenames()[ 0 ] );
195
196 // Strip the extension
197 result.erase( result.rfind( '.' ) );
198
199 return result;
200 }
201
LsaDictionary(string const & id,string const & indexFile,vector<string> const & dictionaryFiles)202 LsaDictionary::LsaDictionary( string const & id,
203 string const & indexFile,
204 vector< string > const & dictionaryFiles ):
205 BtreeDictionary( id, dictionaryFiles ),
206 idx( indexFile, "rb" ),
207 idxHeader( idx.read< IdxHeader >() )
208 {
209 // Initialize the index
210
211 openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
212 idxHeader.indexRootOffset ),
213 idx, idxMutex );
214 }
215
getArticle(wstring const & word,vector<wstring> const & alts,wstring const &,bool ignoreDiacritics)216 sptr< Dictionary::DataRequest > LsaDictionary::getArticle( wstring const & word,
217 vector< wstring > const & alts,
218 wstring const &,
219 bool ignoreDiacritics )
220 THROW_SPEC( std::exception )
221 {
222 vector< WordArticleLink > chain = findArticles( word, ignoreDiacritics );
223
224 for( unsigned x = 0; x < alts.size(); ++x )
225 {
226 /// Make an additional query for each alt
227
228 vector< WordArticleLink > altChain = findArticles( alts[ x ], ignoreDiacritics );
229
230 chain.insert( chain.end(), altChain.begin(), altChain.end() );
231 }
232
233 multimap< wstring, string > mainArticles, alternateArticles;
234
235 set< uint32_t > articlesIncluded; // Some synonims make it that the articles
236 // appear several times. We combat this
237 // by only allowing them to appear once.
238
239 wstring wordCaseFolded = Folding::applySimpleCaseOnly( word );
240 if( ignoreDiacritics )
241 wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
242
243 for( unsigned x = 0; x < chain.size(); ++x )
244 {
245 if ( articlesIncluded.find( chain[ x ].articleOffset ) != articlesIncluded.end() )
246 continue; // We already have this article in the body.
247
248 // Ok. Now, does it go to main articles, or to alternate ones? We list
249 // main ones first, and alternates after.
250
251 // We do the case-folded comparison here.
252
253 wstring headwordStripped =
254 Folding::applySimpleCaseOnly( Utf8::decode( chain[ x ].word ) );
255 if( ignoreDiacritics )
256 headwordStripped = Folding::applyDiacriticsOnly( headwordStripped );
257
258 multimap< wstring, string > & mapToUse =
259 ( wordCaseFolded == headwordStripped ) ?
260 mainArticles : alternateArticles;
261
262 mapToUse.insert( std::pair< wstring, string >(
263 Folding::applySimpleCaseOnly( Utf8::decode( chain[ x ].word ) ), chain[ x ].word ) );
264
265 articlesIncluded.insert( chain[ x ].articleOffset );
266 }
267
268 if ( mainArticles.empty() && alternateArticles.empty() )
269 return new Dictionary::DataRequestInstant( false ); // No such word
270
271 string result;
272
273 multimap< wstring, string >::const_iterator i;
274
275 result += "<table class=\"lsa_play\">";
276 for( i = mainArticles.begin(); i != mainArticles.end(); ++i )
277 {
278 result += "<tr>";
279
280 QUrl url;
281 url.setScheme( "gdau" );
282 url.setHost( QString::fromUtf8( getId().c_str() ) );
283 url.setPath( Qt4x5::Url::ensureLeadingSlash( QString::fromUtf8( i->second.c_str() ) ) );
284
285 string ref = string( "\"" ) + url.toEncoded().data() + "\"";
286
287 result += addAudioLink( ref, getId() );
288
289 result += "<td><a href=" + ref + "><img src=\"qrcx://localhost/icons/playsound.png\" border=\"0\" alt=\"Play\"/></a></td>";
290 result += "<td><a href=" + ref + ">" + i->second + "</a></td>";
291 result += "</tr>";
292 }
293
294 for( i = alternateArticles.begin(); i != alternateArticles.end(); ++i )
295 {
296 result += "<tr>";
297
298 QUrl url;
299 url.setScheme( "gdau" );
300 url.setHost( QString::fromUtf8( getId().c_str() ) );
301 url.setPath( Qt4x5::Url::ensureLeadingSlash( QString::fromUtf8( i->second.c_str() ) ) );
302
303 string ref = string( "\"" ) + url.toEncoded().data() + "\"";
304
305 result += addAudioLink( ref, getId() );
306
307 result += "<td><a href=" + ref + "><img src=\"qrcx://localhost/icons/playsound.png\" border=\"0\" alt=\"Play\"/></a></td>";
308 result += "<td><a href=" + ref + ">" + i->second + "</a></td>";
309 result += "</tr>";
310 }
311
312 result += "</table>";
313
314 Dictionary::DataRequestInstant * ret =
315 new Dictionary::DataRequestInstant( true );
316
317 ret->getData().resize( result.size() );
318
319 memcpy( &(ret->getData().front()), result.data(), result.size() );
320
321 return ret;
322 }
323
/// This wraps around file operations: it presents the part of file f which
/// starts at byte offset shift as if it were a standalone stream, so that
/// vorbisfile can be pointed at Vorbis data embedded in a larger file.
struct ShiftedVorbis
{
  QFile & f; // The underlying file; referenced, not owned
  size_t shift; // Offset in f at which the stream begins

  ShiftedVorbis( QFile & f_, size_t shift_ ): f( f_ ), shift( shift_ )
  {}

  // The callbacks below receive a ShiftedVorbis * as their datasource
  static size_t read( void * ptr, size_t size, size_t nmemb, void * datasource );
  static int seek( void * datasource, ogg_int64_t offset, int whence );
  static long tell( void * datasource );

  // The callback table to be passed to ov_open_callbacks()
  static ov_callbacks callbacks;
};
339
read(void * ptr,size_t size,size_t nmemb,void * datasource)340 size_t ShiftedVorbis::read( void * ptr, size_t size, size_t nmemb,
341 void * datasource )
342 {
343 ShiftedVorbis * sv = ( ShiftedVorbis * ) datasource;
344
345 return sv->f.read( reinterpret_cast<char *>( ptr ), size * nmemb );
346 }
347
seek(void * datasource,ogg_int64_t offset,int whence)348 int ShiftedVorbis::seek( void * datasource, ogg_int64_t offset, int whence )
349 {
350 ShiftedVorbis * sv = ( ShiftedVorbis * ) datasource;
351
352 if ( whence == SEEK_SET )
353 offset += sv->shift;
354
355 if( whence == SEEK_CUR )
356 offset += sv->f.pos();
357
358 if( whence == SEEK_END )
359 offset += sv->f.size();
360
361 return sv->f.seek( offset );
362 }
363
tell(void * datasource)364 long ShiftedVorbis::tell( void * datasource )
365 {
366 ShiftedVorbis * sv = ( ShiftedVorbis * ) datasource;
367 long result = sv->f.pos();
368
369 if ( result != -1 )
370 result -= sv->shift;
371
372 return result;
373 }
374
// The callback table for ov_open_callbacks(). The third slot -- the close
// callback in vorbisfile's ov_callbacks -- is left NULL, so the file is not
// closed through this table.
ov_callbacks ShiftedVorbis::callbacks = { ShiftedVorbis::read,
                                          ShiftedVorbis::seek,
                                          NULL,
                                          ShiftedVorbis::tell };
379
// A crude .wav header which is sufficient for our needs. It is filled in
// place at the start of the output buffer by getResource(), hence the packed
// layout. The values noted below are the ones getResource() stores.
struct WavHeader
{
  char riff[ 4 ]; // "RIFF"
  uint32_t riffLength; // Total size of the data less 8
  char waveAndFmt[ 8 ]; // "WAVEfmt " -- note the trailing space
  uint32_t fmtLength; // 16
  uint16_t formatTag; // 1
  uint16_t channels; // 1 or 2
  uint32_t samplesPerSec;
  uint32_t bytesPerSec; // channels * samplesPerSec * 2
  uint16_t blockAlign; // channels * 2
  uint16_t bitsPerSample; // 16
  char data[ 4 ]; // "data"
  uint32_t dataLength; // Size of the sample data following the header
}
#ifndef _MSC_VER
__attribute__((packed))
#endif
;
400
getResource(string const & name)401 sptr< Dictionary::DataRequest > LsaDictionary::getResource( string const & name )
402 THROW_SPEC( std::exception )
403 {
404 // See if the name ends in .wav. Remove that extension then
405
406 string strippedName =
407 ( name.size() > 3 && ( name.compare( name.size() - 4, 4, ".wav" ) == 0 ) ) ?
408 string( name, 0, name.size() - 4 ) : name;
409
410 vector< WordArticleLink > chain = findArticles( Utf8::decode( strippedName ) );
411
412 if ( chain.empty() )
413 return new Dictionary::DataRequestInstant( false ); // No such resource
414
415 File::Class f( getDictionaryFilenames()[ 0 ], "rb" );
416
417 f.seek( chain[ 0 ].articleOffset );
418 Entry e( f );
419
420 f.seek( idxHeader.vorbisOffset );
421
422 ShiftedVorbis sv( f.file(), idxHeader.vorbisOffset );
423
424 OggVorbis_File vf;
425
426 int result = ov_open_callbacks( &sv, &vf, 0, 0, ShiftedVorbis::callbacks );
427
428 if ( result )
429 throw exFailedToOpenVorbisData();
430
431 if ( ov_pcm_seek( &vf, e.samplesOffset ) )
432 throw exFailedToSeekInVorbisData();
433
434 vorbis_info * vi = ov_info( &vf, -1 );
435
436 if ( !vi )
437 {
438 ov_clear( &vf );
439
440 throw exFailedToRetrieveVorbisInfo();
441 }
442
443 sptr< Dictionary::DataRequestInstant > dr = new
444 Dictionary::DataRequestInstant( true );
445
446 vector< char > & data = dr->getData();
447
448 data.resize( sizeof( WavHeader ) + e.samplesLength * 2 );
449
450 WavHeader * wh = (WavHeader *)&data.front();
451
452 memset( wh, 0, sizeof( *wh ) );
453
454 memcpy( wh->riff, "RIFF", 4 );
455 wh->riffLength = data.size() - 8;
456
457 memcpy( wh->waveAndFmt, "WAVEfmt ", 8 );
458 wh->fmtLength = 16;
459 wh->formatTag = 1;
460 wh->channels = vi->channels;
461 wh->samplesPerSec = vi->rate;
462 wh->bytesPerSec = vi->channels * vi->rate * 2;
463 wh->blockAlign = vi->channels * 2;
464 wh->bitsPerSample = 16;
465 memcpy( wh->data, "data", 4 );
466 wh->dataLength = data.size() - sizeof( *wh );
467
468 // Now decode vorbis to the rest of the block
469
470 char * ptr = &data.front() + sizeof( *wh );
471 int left = data.size() - sizeof( *wh );
472 int bitstream = 0;
473
474 while( left )
475 {
476 long result = ov_read( &vf, ptr, left, 0, 2, 1, &bitstream );
477
478 if ( result <= 0 )
479 {
480 gdWarning( "Failed to read Vorbis data (code = %ld)\n", result );
481 memset( ptr, 0, left );
482 break;
483 }
484
485 if ( result > left )
486 {
487 GD_FDPRINTF( stderr, "Warning: Vorbis decode returned more data than requested.\n" );
488
489 result = left;
490 }
491
492 ptr += result;
493 left -= result;
494 }
495
496 ov_clear( &vf );
497
498 return dr;
499 }
500
loadIcon()501 void LsaDictionary::loadIcon() throw()
502 {
503 if ( dictionaryIconLoaded )
504 return;
505
506 QString fileName =
507 QDir::fromNativeSeparators( FsEncoding::decode( getDictionaryFilenames()[ 0 ].c_str() ) );
508
509 // Remove the extension
510 fileName.chop( 3 );
511
512 if( !loadIconFromFile( fileName ) )
513 {
514 // Load failed -- use default icons
515 dictionaryNativeIcon = dictionaryIcon = QIcon(":/icons/playsound.png");
516 }
517
518 dictionaryIconLoaded = true;
519 }
520
521 }
522
makeDictionaries(vector<string> const & fileNames,string const & indicesDir,Dictionary::Initializing & initializing)523 vector< sptr< Dictionary::Class > > makeDictionaries(
524 vector< string > const & fileNames,
525 string const & indicesDir,
526 Dictionary::Initializing & initializing )
527 THROW_SPEC( std::exception )
528 {
529 vector< sptr< Dictionary::Class > > dictionaries;
530
531 for( vector< string >::const_iterator i = fileNames.begin(); i != fileNames.end();
532 ++i )
533 {
534 /// Only allow .dat and .lsa extensions to save scanning time
535 if ( i->size() < 4 ||
536 ( strcasecmp( i->c_str() + ( i->size() - 4 ), ".dat" ) != 0 &&
537 strcasecmp( i->c_str() + ( i->size() - 4 ), ".lsa" ) != 0 ) )
538 continue;
539
540 try
541 {
542 File::Class f( *i, "rb" );
543
544 /// Check the signature
545
546 char buf[ 9 ];
547
548 if ( f.readRecords( buf, 9, 1 ) != 1 || memcmp( buf, "L\09\0S\0A\0\xff", 9 ) != 0 )
549 {
550 // The file is too short or the signature doesn't match -- skip this
551 // file
552 continue;
553 }
554
555 vector< string > dictFiles( 1, *i );
556
557 string dictId = Dictionary::makeDictionaryId( dictFiles );
558
559 string indexFile = indicesDir + dictId;
560
561 if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) || indexIsOldOrBad( indexFile ) )
562 {
563 // Building the index
564
565 gdDebug( "Lsa: Building the index for dictionary: %s\n", i->c_str() );
566
567 initializing.indexingDictionary( FsEncoding::basename( *i ) );
568
569 File::Class idx( indexFile, "wb" );
570
571 IdxHeader idxHeader;
572
573 memset( &idxHeader, 0, sizeof( idxHeader ) );
574
575 // We write a dummy header first. At the end of the process the header
576 // will be rewritten with the right values.
577
578 idx.write( idxHeader );
579
580 IndexedWords indexedWords;
581
582 /// XXX handle big-endian machines here!
583 uint32_t entriesCount = f.read< uint32_t >();
584
585 GD_DPRINTF( "%s: %u entries\n", i->c_str(), entriesCount );
586
587 idxHeader.soundsCount = entriesCount;
588
589 vector< uint16_t > filenameBuffer;
590
591 while( entriesCount-- )
592 {
593 uint32_t offset = f.tell();
594
595 Entry e( f );
596
597
598 // Remove the extension, no need for that in the index
599 e.name = stripExtension( e.name );
600
601 GD_DPRINTF( "Read filename %s (%u at %u)<\n", e.name.c_str(), e.samplesLength, e.samplesOffset );
602
603 // Insert new entry into an index
604
605 indexedWords.addWord( Utf8::decode( e.name ), offset );
606 }
607
608 idxHeader.vorbisOffset = f.tell();
609
610 // Make sure there's really some vobis data there
611
612 char buf[ 4 ];
613
614 f.read( buf, sizeof( buf ) );
615
616 if ( strncmp( buf, "OggS", 4 ) != 0 )
617 throw exInvalidData();
618
619 // Build the index
620
621 IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );
622
623 idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
624 idxHeader.indexRootOffset = idxInfo.rootOffset;
625
626 // That concludes it. Update the header.
627
628 idxHeader.signature = Signature;
629 idxHeader.formatVersion = CurrentFormatVersion;
630
631 idx.rewind();
632
633 idx.write( &idxHeader, sizeof( idxHeader ) );
634 }
635
636 dictionaries.push_back( new LsaDictionary( dictId,
637 indexFile,
638 dictFiles ) );
639 }
640 catch( std::exception & e )
641 {
642 gdWarning( "Lingvo's LSA reading failed: %s, error: %s\n",
643 i->c_str(), e.what() );
644 }
645 }
646
647 return dictionaries;
648 }
649
650
651 }
652