1 /** \file tinydict.cpp
2 \brief .dict dictionary file support implementation
3
4 (c) Vadim Lopatin, 2009
5
6 This source code is distributed under the terms of
7 GNU General Public License.
8
9 See LICENSE file for details.
10
11 */
12
13 #include <stdlib.h>
14 #include "tinydict.h"
15
16
17 /// add word to list
add(TinyDictWord * word)18 void TinyDictWordList::add( TinyDictWord * word )
19 {
20 if ( count>=size ) {
21 size = size ? size * 2 : 32;
22 list = (TinyDictWord **)realloc( list, sizeof(TinyDictWord *) * size );
23 }
24 list[ count++ ] = word;
25 }
26
27 /// clear list
clear()28 void TinyDictWordList::clear()
29 {
30 if ( list ) {
31 for ( int i=0; i<count; i++ )
32 delete list[i];
33 free( list );
34 list = NULL;
35 count = size = 0;
36 }
37 }
38
39 /// empty list constructor
TinyDictWordList()40 TinyDictWordList::TinyDictWordList() : dict(NULL), list(NULL), size(0), count(0) { }
41
42 /// destructor
~TinyDictWordList()43 TinyDictWordList::~TinyDictWordList() { clear(); }
44
45
/// Common base of the dictionary file classes: owns a heap copy of the
/// file name and the stdio stream.
class TinyDictFileBase
{
    // Not copyable: a copy would free fname and fclose f a second time.
    // Declared but intentionally left undefined.
    TinyDictFileBase( const TinyDictFileBase & );
    TinyDictFileBase & operator = ( const TinyDictFileBase & );
protected:
    char * fname;  ///< heap copy of the file name, NULL when unset
    FILE * f;      ///< open stream, NULL when closed
    size_t size;   ///< set by the derived open(); reset to 0 by close()
    /// Replaces the stored file name; NULL or an empty string clears it.
    void setFilename( const char * filename )
    {
        // duplicate before freeing so that passing fname itself stays safe
        char * copy = ( filename && *filename ) ? strdup( filename ) : NULL;
        free( fname );
        fname = copy;
    }
public:
    /// Creates an object with no file name and no open stream.
    TinyDictFileBase() : fname(NULL), f(NULL), size(0)
    {
    }
    /// Closes the stream and releases the file name.
    virtual ~TinyDictFileBase()
    {
        close();
        setFilename( NULL );
    }
    /// Closes the stream if it is open and resets size; the file name is kept.
    virtual void close()
    {
        if (f)
            fclose(f);
        f = NULL;
        size = 0;
    }
};
79
80 class TinyDictIndexFile : public TinyDictFileBase
81 {
82 int factor;
83 int count;
84 TinyDictWordList list;
85 public:
86
compact()87 void compact()
88 {
89 // do nothing
90 }
91
92 bool find( const char * prefix, bool exactMatch, TinyDictWordList & words );
93
TinyDictIndexFile()94 TinyDictIndexFile() : factor( 16 ), count(0)
95 {
96 }
97
~TinyDictIndexFile()98 virtual ~TinyDictIndexFile()
99 {
100 }
101
102 bool open( const char * filename );
103
104 };
105
106 class TinyDictCRC
107 {
108 unsigned crc;
109 public:
reset()110 void reset()
111 {
112 crc = crc32( 0L, Z_NULL, 0 );
113 }
get()114 unsigned get()
115 {
116 return crc;
117 }
update(const void * data,unsigned size)118 unsigned update( const void * data, unsigned size )
119 {
120 crc = crc32( crc, (const unsigned char *)data, size );
121 return crc;
122 }
update(unsigned char b)123 unsigned update( unsigned char b )
124 {
125 return update( &b, sizeof(b) );
126 }
update(unsigned short b)127 unsigned update( unsigned short b )
128 {
129 return update( &b, sizeof(b) );
130 }
update(unsigned int b)131 unsigned update( unsigned int b )
132 {
133 return update( &b, sizeof(b) );
134 }
TinyDictCRC()135 TinyDictCRC()
136 {
137 reset();
138 }
139 };
140
141 class TinyDictZStream
142 {
143 FILE * f;
144 TinyDictCRC crc;
145
146 int type;
147 unsigned size;
148 unsigned txtpos;
149
150 unsigned headerLength;
151 bool error;
152 unsigned short * chunks;
153 unsigned int * offsets;
154 unsigned extraLength;
155 unsigned char subfieldID1;
156 unsigned char subfieldID2;
157 unsigned subfieldLength;
158 unsigned subfieldVersion;
159 unsigned chunkLength;
160 unsigned chunkCount;
161
162 bool zInitialized;
163 z_stream zStream;
164 unsigned packed_size;
165 unsigned char * unp_buffer;
166 unsigned unp_buffer_start;
167 unsigned unp_buffer_len;
168 unsigned unp_buffer_size;
169
readBytes(unsigned char * buf,unsigned size)170 unsigned int readBytes( unsigned char * buf, unsigned size )
171 {
172 if ( error || !f )
173 return 0;
174 unsigned int bytesRead = fread( buf, 1, size, f );
175 crc.update( buf, bytesRead );
176 return bytesRead;
177 }
178
readU32()179 unsigned int readU32()
180 {
181 unsigned char buf[4];
182 if ( !error && f && fread( buf, 1, 4, f )==4 ) {
183 crc.update( buf, 4 );
184 return (((((((unsigned int)buf[3]) << 8) + buf[2]) << 8) + buf[1]) << 8 ) + buf[0];
185 }
186 error = true;
187 return 0;
188 }
189
readU16()190 unsigned short readU16()
191 {
192 unsigned char buf[2];
193 if ( !error && f && fread( buf, 1, 2, f )==2 ) {
194 crc.update( buf, 2 );
195 return (((unsigned short)buf[1]) << 8) + buf[0];
196 }
197 error = true;
198 return 0;
199 }
200
readU8()201 unsigned char readU8()
202 {
203 unsigned char buf[1];
204 if ( !error && f && fread( buf, 1, 1, f )==1 ) {
205 crc.update( buf, 1 );
206 return buf[0];
207 }
208 error = true;
209 return 0;
210 }
211
212 bool zinit(unsigned char * next_in, unsigned avail_in, unsigned char * next_out, unsigned avail_out);
213
214 bool zclose();
215
216 bool readChunk( unsigned n );
217
218 public:
219 /// minimize memory consumption
220 void compact();
221 /// get unpacked data size
getSize()222 unsigned getSize() { return size; }
223 /// create uninitialized stream
224 TinyDictZStream();
225 /// open from file
226 bool open( FILE * file );
227 /// read block of data
228 bool read( unsigned char * buf, unsigned start, unsigned len );
229 /// close stream
230 ~TinyDictZStream();
231 };
232
233 class TinyDictDataFile : public TinyDictFileBase
234 {
235 bool compressed;
236 char * buf;
237 int buf_size;
238
239 TinyDictZStream zstream;
240
reserve(int sz)241 void reserve( int sz )
242 {
243 if ( buf_size < sz ) {
244 char * oldptr = buf;
245 buf = (char*) realloc( buf, sizeof(char) * sz );
246 if ( !buf) {
247 free(oldptr);
248 fprintf(stderr, "out of memory\n");
249 exit(-2);
250 }
251 buf_size = sz;
252 }
253 }
254
255
256 public:
257
compact()258 void compact()
259 {
260 zstream.compact();
261 if ( buf ) {
262 free( buf );
263 buf = NULL;
264 buf_size = 0;
265 }
266 }
267
268 const char * read( const TinyDictWord * w );
269
270 bool open( const char * filename );
271
TinyDictDataFile()272 TinyDictDataFile() : compressed(false), buf(0), buf_size(0)
273 {
274 }
275
~TinyDictDataFile()276 virtual ~TinyDictDataFile()
277 {
278 if ( buf )
279 free( buf );
280 }
281 };
282
283
/// character code -> 6-bit digit value; -1 for characters outside the alphabet (filled on first use)
static int base64table[128] = { 0 };
static const char * base64chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/// Parses a number written in base64 digits, most significant digit first.
///
/// @param str  zero-terminated digit string; an empty string yields 0
/// @return the value (wrapping modulo the range of unsigned), or (unsigned)-1
///         when str is NULL or contains a character outside the alphabet
static unsigned parseBase64( const char * str )
{
    int i;
    if ( !str )
        return (unsigned)-1;
    // entry 0 is 0 only before the table is built; afterwards it holds -1
    if ( !*base64table ) {
        for ( i=0; i<128; i++ )
            base64table[i] = -1;
        for ( i=0; base64chars[i]; i++ )
            base64table[(unsigned char)base64chars[i]] = i;
    }
    unsigned n = 0;
    for ( ; *str; str++ ) {
        // go through unsigned char: a negative plain char must not be used as an index
        unsigned char ch = (unsigned char)*str;
        if ( ch >= 128 )
            return (unsigned)-1;
        int code = base64table[ ch ];
        if ( code<0 )
            return (unsigned)-1;
        n = ( n << 6 ) + code;
    }
    return n;
}
305
compare(const char * str) const306 int TinyDictWord::compare( const char * str ) const
307 {
308 return strcmp( word, str );
309 }
310
/// Reads one line from f without the line feed.
///
/// Stops at '\n' (consumed, not stored), at end of file, at a NUL byte, or
/// after size characters. The terminator is written after the stored
/// characters, so buf must have room for size + 1 bytes.
///
/// @return number of characters stored
static int my_fgets( char * buf, int size, FILE * f )
{
    int stored = 0;
    while ( stored < size ) {
        const int ch = fgetc( f );
        // EOF is negative; a zero byte ends the line as well
        if ( ch == '\n' || ch <= 0 )
            break;
        buf[stored++] = (char) ch;
    }
    buf[stored] = 0;
    return stored;
}
326
327 /// factory - reading from index file
read(FILE * f,unsigned index)328 TinyDictWord * TinyDictWord::read( FILE * f, unsigned index )
329 {
330 if ( !f || feof(f) )
331 return NULL;
332 char buf[1024];
333 unsigned indexpos = ftell( f );
334 int sz = my_fgets( buf, 1023, f );
335 if ( !sz )
336 return NULL;
337 int tabc = 0;
338 int tabs[2];
339 for ( int i=0; buf[i]; i++ ) {
340 if ( buf[i] == '\t' && tabc<2 )
341 tabs[tabc++] = i;
342 }
343 if ( tabc!=2 )
344 return NULL;
345 const char * word = buf;
346 const char * pos_str = buf + tabs[0] + 1;
347 const char * len_str = buf + tabs[1] + 1;
348 buf[tabs[0]] = 0;
349 buf[tabs[1]] = 0;
350 unsigned start = parseBase64( pos_str );
351 unsigned len = parseBase64( len_str );
352 if ( start==(unsigned)-1 || len==(unsigned)-1 )
353 return NULL;
354 return new TinyDictWord( index, indexpos, start, len, word );
355 }
356
find(const char * prefix)357 int TinyDictWordList::find( const char * prefix )
358 {
359 if ( !count )
360 return -1;
361 int a = 0;
362 int b = count;
363 for ( ;a < b-1; ) {
364 int c = (a + b) / 2;
365 int res = list[c]->compare( prefix );
366 if ( !res )
367 return c;
368 if ( res < 0 ) {
369 a = c + 1;
370 } else {
371 b = c;
372 }
373 }
374 if ( a==0 || list[a]->compare( prefix )<0 )
375 return a;
376 return a - 1;
377 }
378
match(const char * str,bool exact) const379 bool TinyDictWord::match( const char * str, bool exact ) const
380 {
381 if ( exact )
382 return !strcmp( word, str );
383 int i=0;
384 for ( ; str[i]; i++ ) {
385 if ( str[i] != word[i] )
386 return false;
387 }
388 return str[i]==0;
389 }
390
find(const char * prefix,bool exactMatch,TinyDictWordList & words)391 bool TinyDictIndexFile::find( const char * prefix, bool exactMatch, TinyDictWordList & words )
392 {
393 words.clear();
394 int n = list.find( prefix );
395 if ( n<0 )
396 return false;
397 TinyDictWord * p = list.get( n );
398 if ( fseek( f, p->getIndexPos(), SEEK_SET ) )
399 return false;
400 int index = p->getIndex();
401 for ( ;; ) {
402 p = TinyDictWord::read( f, index++ );
403 if ( !p )
404 break;
405 int res = p->compare( prefix );
406 if ( p->match( prefix, exactMatch ) )
407 words.add( p );
408 else {
409 delete p;
410 if ( res > 0 )
411 break;
412 }
413 }
414 return true;
415 }
416
open(const char * filename)417 bool TinyDictIndexFile::open( const char * filename )
418 {
419 close();
420 if ( filename )
421 setFilename( filename );
422 if ( !fname )
423 return false;
424 f = fopen( fname, "rb" );
425 if ( !f )
426 return false;
427 if ( fseek( f, 0, SEEK_END ) ) {
428 close();
429 return false;
430 }
431 size = ftell( f );
432 if ( fseek( f, 0, SEEK_SET ) ) {
433 close();
434 return false;
435 }
436 // test
437 TinyDictWord * p;
438 count = 0;
439 for ( ;; count++ ) {
440 p = TinyDictWord::read( f, count );
441 if ( !p )
442 break;
443 if ( (count % factor) == 0 ) {
444 list.add( p );
445 } else {
446 delete p;
447 }
448 }
449 printf("%d words read from index\n", count);
450 return true;
451 }
452
/// data file kinds assigned by TinyDictZStream::open()
enum {
    DICT_TEXT = 0, ///< no gzip magic
    DICT_GZIP = 1, ///< gzip header without an extra field
    DICT_DZIP = 2  ///< gzip header with an extra field
};
458
zinit(unsigned char * next_in,unsigned avail_in,unsigned char * next_out,unsigned avail_out)459 bool TinyDictZStream::zinit( unsigned char * next_in, unsigned avail_in, unsigned char * next_out, unsigned avail_out )
460 {
461 zclose();
462 if ( !zInitialized ) {
463 zStream.zalloc = NULL;
464 zStream.zfree = NULL;
465 zStream.opaque = NULL;
466 zStream.next_in = next_in;
467 zStream.avail_in = avail_in;
468 zStream.next_out = next_out;
469 zStream.avail_out = avail_out;
470 if (inflateInit2( &zStream, -15 ) != Z_OK ) {
471 // zlib initialization failed
472 return false;
473 }
474 zInitialized = true;
475 }
476 return true;
477 }
478
zclose()479 bool TinyDictZStream::zclose()
480 {
481 if ( zInitialized ) {
482 inflateEnd( &zStream );
483 zInitialized = false;
484 }
485 return true;
486 }
487
compact()488 void TinyDictZStream::compact()
489 {
490 if ( unp_buffer ) {
491 free( unp_buffer );
492 unp_buffer_start = unp_buffer_len = unp_buffer_size = 0;
493 unp_buffer = NULL;
494 }
495 }
496
readChunk(unsigned n)497 bool TinyDictZStream::readChunk( unsigned n )
498 {
499 if ( n >= chunkCount )
500 return false;
501 if ( !unp_buffer ) {
502 unp_buffer = (unsigned char *)malloc( sizeof(unsigned char)*chunkLength );
503 unp_buffer_size = chunkLength;
504 }
505 unp_buffer_start = n * chunkLength;
506
507 if ( fseek( f, offsets[ n ], SEEK_SET ) ) {
508 printf( "cannot seek to %d position\n", offsets[n] );
509 return false;
510 }
511 unsigned packsz = chunks[n];
512 unsigned char * tmp = (unsigned char *)malloc( sizeof(unsigned char) * packsz );
513
514 crc.reset();
515 unsigned int bytesRead = readBytes( tmp, packsz );
516 unsigned crc1 = crc.get();
517 unsigned crc2 = readU32();
518 if ( bytesRead != packsz || error ) {
519 printf( "error reading packed data\n" );
520 free( tmp );
521 return false;
522 }
523 if ( crc1!=crc2 ) {
524 printf( "CRC error: real: %08x expected: %08x\n", crc1, crc2 );
525 //free( tmp );
526 //return false;
527 }
528 zclose();
529 if ( !zinit(tmp, packsz, unp_buffer, unp_buffer_size) ) {
530 printf("cannot init deflater\n");
531 return false;
532 }
533 printf("unpacking %d bytes\n", packsz);
534 int err = inflate( &zStream, Z_PARTIAL_FLUSH );
535 printf("inflate result: %d\n", err);
536 if ( err != Z_OK ) {
537 printf("Inflate error %s (%d). avail_in=%d, avail_out=%d \n", zStream.msg, err, (int)zStream.avail_in, (int)zStream.avail_out);
538 free( tmp );
539 return false;
540 }
541 if ( zStream.avail_in ) {
542 printf("Inflate: not all data read, still %d bytes available\n", (int)zStream.avail_in );
543 free( tmp );
544 return false;
545 }
546 unp_buffer_len = unp_buffer_size - zStream.avail_out;
547
548 printf("freeing tmp\n");
549 free( tmp );
550 printf("done\n");
551
552
553
554 if ( n < chunkCount-1 && unp_buffer_len!=chunkLength ) {
555 printf("wrong chunk length\n");
556 return false; // too short chunk data
557 }
558
559
560 zclose();
561 return true;
562 }
563
read(unsigned char * buf,unsigned start,unsigned len)564 bool TinyDictZStream::read( unsigned char * buf, unsigned start, unsigned len )
565 {
566 if ( start >= unp_buffer_start && start < unp_buffer_start + unp_buffer_len ) {
567 unsigned readyBytes = unp_buffer_len - (start-unp_buffer_start);
568 if ( readyBytes > len )
569 readyBytes = len;
570 memcpy( buf, unp_buffer + (start-unp_buffer_start), readyBytes );
571 buf += readyBytes;
572 start += readyBytes;
573 len -= readyBytes;
574 if ( !len )
575 return true;
576 }
577 unsigned n = start / chunkLength;
578 if ( !readChunk( n ) )
579 return false;
580 unsigned readyBytes = unp_buffer_len - (start-unp_buffer_start);
581 if ( readyBytes > len )
582 readyBytes = len;
583 memcpy( buf, unp_buffer + (start-unp_buffer_start), readyBytes );
584 buf += readyBytes;
585 start += readyBytes;
586 len -= readyBytes;
587 if ( !len )
588 return true;
589 return false;
590 }
591
TinyDictZStream()592 TinyDictZStream::TinyDictZStream()
593 : f ( NULL ), size( 0 ), txtpos(0)
594 , headerLength(0), error( false )
595 , chunks(NULL), offsets(NULL), chunkLength(0), chunkCount(0)
596 , zInitialized(false), packed_size(0), unp_buffer(NULL), unp_buffer_start(0), unp_buffer_len(0), unp_buffer_size(0)
597 {
598 memset( &zStream, 0, sizeof(zStream) );
599 }
600
~TinyDictZStream()601 TinyDictZStream::~TinyDictZStream()
602 {
603 zclose();
604 if ( unp_buffer )
605 free( unp_buffer );
606 if ( chunks )
607 delete [] chunks;
608 if ( offsets )
609 delete [] offsets;
610 }
611
open(FILE * file)612 bool TinyDictZStream::open( FILE * file )
613 {
614 f = file;
615 error = false;
616 if ( fseek( f, 0, SEEK_END ) ) {
617 return false;
618 }
619 packed_size = ftell( f );
620 if ( fseek( f, 0, SEEK_SET ) ) {
621 return false;
622 }
623
624 crc.reset();
625 unsigned char header[10];
626 if ( fread( header, 1, sizeof(header), f )!=sizeof(header) ) {
627 return false;
628 }
629 crc.update( header, sizeof(header) );
630 if ( header[0]!=0x1f || header[1]!=0x8b ) { // 0x1F 0x8B -- GZIP magic
631 type = DICT_TEXT;
632 return true;
633 }
634 if ( header[2]!=8 ) {
635 // unknown compression method
636 return false;
637 }
638 unsigned char flg = header[3];
639 headerLength = 10;
640
641 //const char FTEXT = 1; // Extra text
642 const char FHCRC = 2; // Header CRC
643 const char FEXTRA = 4; // Extra field
644 const char FNAME = 8; // File name
645 const char FCOMMENT = 16; // File comment
646
647 type = DICT_GZIP;
648
649 packed_size = size;
650
651 // Optional extra field
652 if ( flg & FEXTRA ) {
653 type = DICT_DZIP;
654 extraLength = readU16();
655 headerLength += extraLength + 2;
656 subfieldID1 = readU8();
657 subfieldID2 = readU8();
658 subfieldLength = readU16(); // 2 bytes subfield length
659 subfieldVersion = readU16(); // 2 bytes subfield version
660 chunkLength = readU16(); // 2 bytes chunk length
661 chunkCount = readU16(); // 2 bytes chunk count
662 if ( error ) {
663 return false;
664 }
665 chunks = new unsigned short[ chunkCount ];
666 for (unsigned i=0; i<chunkCount; i++) {
667 chunks[i] = readU16();
668 }
669 size = 0;
670 } else {
671 // GZIP is not supported, use DZIP
672 return false;
673 }
674 // Skip optional file name
675 if ( flg & FNAME ) {
676 while (readU8() != 0 )
677 headerLength++;
678 headerLength++;
679 }
680 // Skip optional file comment
681 if ( flg & FCOMMENT ) {
682 while (readU8() != 0)
683 headerLength++;
684 headerLength++;
685 }
686 // Check optional header CRC
687 if ( flg & FHCRC ) {
688 int v = (int)crc.get() & 0xffff;
689 if (readU16() != v) {
690 // CRC failed
691 error = true;
692 }
693 headerLength += 2;
694 }
695
696 if ( chunkCount ) {
697 offsets = new unsigned int[ chunkCount ];
698 offsets[0] = headerLength;
699 size = chunks[0];
700 for ( unsigned i=1; i<chunkCount; i++ ) {
701 offsets[i] = offsets[i-1] + chunks[i-1];
702 size += chunks[i];
703 }
704 }
705
706 if ( fseek( f, headerLength, SEEK_SET ) ) {
707 return false;
708 }
709
710 if ( !readChunk( chunkCount-1 ) ) {
711 printf("Error reading chunk %d\n", chunkCount-1 );
712 return false;
713 }
714 size = unp_buffer_start + unp_buffer_len;
715
716 compact();
717 return true;
718 }
719
read(const TinyDictWord * w)720 const char * TinyDictDataFile::read( const TinyDictWord * w )
721 {
722 if ( !f || !w || w->getStart() + w->getSize() > size ) {
723 printf("article is out of file range (%d)\n", (int)size);
724 return NULL;
725 }
726
727 reserve( w->getSize() + 1 );
728 if ( !compressed ) {
729 // uncompressed
730 printf("reading uncompressed article\n");
731 if ( fseek( f, w->getStart(), SEEK_SET ) )
732 return NULL;
733 if ( fread( buf, 1, w->getSize(), f ) != w->getSize() )
734 return NULL;
735 } else {
736 // compressed
737 printf("reading compressed article\n");
738 if ( !zstream.read( (unsigned char*)buf, w->getStart(), w->getSize() ) )
739 return NULL;
740 }
741 buf[ w->getSize() ] = 0;
742 return buf;
743 }
744
open(const char * filename)745 bool TinyDictDataFile::open( const char * filename )
746 {
747 close();
748 if ( filename )
749 setFilename( filename );
750 if ( !fname )
751 return false;
752 f = fopen( fname, "rb" );
753 if ( !f )
754 return false;
755 if ( fseek( f, 0, SEEK_END ) ) {
756 close();
757 return false;
758 }
759 size = ftell( f );
760 if ( fseek( f, 0, SEEK_SET ) ) {
761 close();
762 return false;
763 }
764
765
766 unsigned char header[10];
767 if ( fread( header, 1, sizeof(header), f )!=sizeof(header) ) {
768 close();
769 return false;
770 }
771
772 if ( header[0]!=0x1f || header[1]!=0x8b ) { // 0x1F 0x8B -- GZIP magic
773 compressed = false;
774 printf("data file %s is not compressed\n", filename);
775 return true;
776 }
777
778 printf("data file %s is compressed\n", filename);
779 compressed = true;
780 if ( !zstream.open( f ) ) {
781 printf("data file %s opening error\n", filename);
782 close();
783 return false;
784 }
785 size = zstream.getSize();
786 return true;
787 }
788
TinyDictionary()789 TinyDictionary::TinyDictionary()
790 {
791 name = NULL;
792 data = new TinyDictDataFile();
793 index = new TinyDictIndexFile();
794 }
795
~TinyDictionary()796 TinyDictionary::~TinyDictionary()
797 {
798 delete data;
799 delete index;
800 if ( name )
801 free( name );
802 }
803
compact()804 void TinyDictionary::compact()
805 {
806 index->compact();
807 data->compact();
808 }
809
getDictionaryName()810 const char * TinyDictionary::getDictionaryName()
811 {
812 return name;
813 }
814
open(const char * indexfile,const char * datafile)815 bool TinyDictionary::open( const char * indexfile, const char * datafile )
816 {
817 // use index file name w/o path and extension as dictionary name
818 int lastSlash = -1;
819 int lastPoint = -1;
820 for ( int i=0; indexfile[i]; i++ ) {
821 if ( indexfile[i]=='/' || indexfile[i]=='\\' )
822 lastSlash = i;
823 else if ( indexfile[i]=='.' )
824 lastPoint = i;
825 }
826 if ( lastPoint>=0 && lastPoint>lastSlash ) {
827 name = strdup( indexfile + lastSlash + 1 );
828 name[ lastPoint - lastSlash - 1 ] = 0;
829 }
830 return index->open( indexfile ) && data->open( datafile );
831 }
832
833 /// returns word list's dictionary name
getDictionaryName()834 const char * TinyDictWordList::getDictionaryName()
835 {
836 if ( !dict )
837 return NULL;
838 return dict->getDictionaryName();
839 }
840
841 /// returns article for word by index
getArticle(int index)842 const char * TinyDictWordList::getArticle( int index )
843 {
844 if ( !dict )
845 return NULL;
846 if ( index<0 || index>=count )
847 return NULL;
848 return dict->getData()->read( list[index] );
849 }
850
851 /// searches dictionary for specified word, caller is responsible for deleting of returned object
find(const char * prefix,int options)852 TinyDictWordList * TinyDictionary::find( const char * prefix, int options )
853 {
854 TinyDictWordList * list = new TinyDictWordList();
855 list->setDict( this );
856 if ( index->find( prefix, (TINY_DICT_OPTION_STARTS_WITH & options) == 0, *list ) && list->length()>0 )
857 return list;
858 delete list;
859 return NULL;
860 }
861
add(const char * indexfile,const char * datafile)862 bool TinyDictionaryList::add( const char * indexfile, const char * datafile )
863 {
864 TinyDictionary * p = new TinyDictionary();
865 if ( !p->open( indexfile, datafile ) ) {
866 delete p;
867 return false;
868 }
869 if ( count>=size ) {
870 size = size ? size * 2 : 32;
871 list = (TinyDictionary**)realloc( list, sizeof(TinyDictionary *) * size );
872 }
873 list[ count++ ] = p;
874 return true;
875 }
876
877 /// create empty list
TinyDictionaryList()878 TinyDictionaryList::TinyDictionaryList() : list(NULL), size(0), count(0) { }
879
880 /// remove all dictionaries from list
clear()881 void TinyDictionaryList::clear()
882 {
883 if ( list ) {
884 for ( int i=0; i<count; i++ )
885 delete list[i];
886 free( list );
887 list = NULL;
888 size = 0;
889 count = 0;
890 }
891 }
892
~TinyDictionaryList()893 TinyDictionaryList::~TinyDictionaryList()
894 {
895 clear();
896 }
897
898 /// search all dictionaries in list for specified pattern
find(TinyDictResultList & result,const char * prefix,int options)899 bool TinyDictionaryList::find( TinyDictResultList & result, const char * prefix, int options )
900 {
901 result.clear();
902 for ( int i=0; i<count; i++ ) {
903 TinyDictWordList * p = list[i]->find( prefix, options );
904 if ( p )
905 result.add( p );
906 }
907 return result.length() > 0;
908 }
909
910
911 /// remove all dictionaries from list
clear()912 void TinyDictResultList::clear()
913 {
914 if ( list ) {
915 for ( int i=0; i<count; i++ )
916 delete list[i];
917 free( list );
918 list = NULL;
919 size = 0;
920 count = 0;
921 }
922 }
923
924 /// create empty list
TinyDictResultList()925 TinyDictResultList::TinyDictResultList() : list(NULL), size(0), count(0) { }
926
927 /// destructor
~TinyDictResultList()928 TinyDictResultList::~TinyDictResultList()
929 {
930 clear();
931 }
932
933 /// add item to list
add(TinyDictWordList * p)934 void TinyDictResultList::add( TinyDictWordList * p )
935 {
936 if ( count>=size ) {
937 size = size ? size * 2 : 32;
938 list = (TinyDictWordList**)realloc( list, sizeof(TinyDictWordList *) * size );
939 }
940 list[ count++ ] = p;
941 }
942
#ifdef TEST_APP
/// Manual test driver: looks up a word through the low-level file classes,
/// then through TinyDictionaryList, and prints what was found.
/// Expects mueller7.index, mueller7.dict and mueller7.dict.dz in the
/// current directory; returns -1 when one of them cannot be opened.
int main( int argc, const char * * argv )
{
    TinyDictIndexFile index;
    TinyDictDataFile data;
    TinyDictDataFile zdata;
    if ( !index.open("mueller7.index") ) {
        printf("cannot open index file mueller7.index\n");
        return -1;
    }
    if ( !data.open("mueller7.dict") ) {
        printf("cannot open data file mueller7.dict\n");
        return -1;
    }
    if ( !zdata.open("mueller7.dict.dz") ) {
        printf("cannot open data file mueller7.dict.dz\n");
        return -1;
    }
    TinyDictWordList words;
    const char * pattern = "full";
    index.find( pattern, true, words );
    printf( "%d words matched pattern %s\n", words.length(), pattern );
    for ( int i=0; i<words.length(); i++ ) {
        TinyDictWord * p = words.get(i);
        printf("%s %d %d\n", p->getWord(), p->getStart(), p->getSize() );
        const char * text = zdata.read( p );
        if ( text )
            printf( "article:\n%s\n", text );
        else
            printf( "cannot read article\n" );
    }

    {
        // create TinyDictionaryList object
        TinyDictionaryList dicts;
        // register dictionaries using
        dicts.add( "mueller7.index", "mueller7.dict.dz" );

        // container for results
        TinyDictResultList results;
        dicts.find(results, "empty", 0 ); // find exact match

        // for each source dictionary that matches pattern
        for ( int d = 0; d<results.length(); d++ ) {
            TinyDictWordList * words = results.get(d);
            // both lookups may return NULL, which must not reach %s
            const char * dictName = words->getDictionaryName();
            printf("dict: %s\n", dictName ? dictName : "(null)" );
            // for each found word
            for ( int i=0; i<words->length(); i++ ) {
                TinyDictWord * word = words->get(i);
                printf("word: %s\n", word->getWord() );
                const char * article = words->getArticle( i );
                printf("article: %s\n", article ? article : "(null)" );
            }
        }
    }
#ifdef _WIN32
    printf("Press any key...");
    getchar();
#endif
    return 0;
}
#endif
1004