1 /** \file tinydict.cpp
2     \brief .dict dictionary file support implementation
3 
4     (c) Vadim Lopatin, 2009
5 
6     This source code is distributed under the terms of
7     GNU General Public License.
8 
9     See LICENSE file for details.
10 
11 */
12 
13 #include <stdlib.h>
14 #include "tinydict.h"
15 
16 
17 /// add word to list
add(TinyDictWord * word)18 void TinyDictWordList::add( TinyDictWord * word )
19 {
20     if ( count>=size ) {
21         size = size ? size * 2 : 32;
22         list = (TinyDictWord **)realloc( list, sizeof(TinyDictWord *) * size );
23     }
24     list[ count++ ] = word;
25 }
26 
27 /// clear list
clear()28 void TinyDictWordList::clear()
29 {
30     if ( list ) {
31         for ( int i=0; i<count; i++ )
32             delete list[i];
33         free( list );
34         list = NULL;
35         count = size = 0;
36     }
37 }
38 
39 /// empty list constructor
TinyDictWordList()40 TinyDictWordList::TinyDictWordList() : dict(NULL), list(NULL), size(0), count(0) { }
41 
42 /// destructor
~TinyDictWordList()43 TinyDictWordList::~TinyDictWordList() { clear(); }
44 
45 
/// Shared state for the dictionary's on-disk files: a remembered file name,
/// a stdio handle and a size value that subclasses fill in.
class TinyDictFileBase
{
protected:
    char * fname;   ///< heap copy of the file name (made with strdup), or NULL
    FILE * f;       ///< open file handle, or NULL while closed
    size_t size;    ///< size set by subclasses; close() resets it to 0

    /// Replace the stored file name with a copy of \a filename.
    /// A NULL or empty \a filename only clears the stored name.
    void setFilename( const char * filename )
    {
        free( fname ); // free(NULL) is a no-op
        const bool hasName = ( filename != NULL && filename[0] != '\0' );
        fname = hasName ? strdup( filename ) : NULL;
    }
public:
    /// Start with no name and no open file.
    TinyDictFileBase() : fname(NULL), f(NULL), size(0) {}

    /// Close the file (if open) and release the stored name.
    virtual ~TinyDictFileBase()
    {
        // inside a destructor this always resolves to TinyDictFileBase::close()
        close();
        setFilename( NULL );
    }

    /// Close the file handle if one is open and reset the size to 0.
    virtual void close()
    {
        if ( f != NULL ) {
            fclose( f );
            f = NULL;
        }
        size = 0;
    }
};
79 
80 class TinyDictIndexFile : public TinyDictFileBase
81 {
82     int    factor;
83     int    count;
84     TinyDictWordList list;
85 public:
86 
compact()87 	void compact()
88 	{
89 		// do nothing
90 	}
91 
92     bool find( const char * prefix, bool exactMatch, TinyDictWordList & words );
93 
TinyDictIndexFile()94     TinyDictIndexFile() : factor( 16 ), count(0)
95     {
96     }
97 
~TinyDictIndexFile()98     virtual ~TinyDictIndexFile()
99     {
100     }
101 
102     bool open( const char * filename );
103 
104 };
105 
106 class TinyDictCRC
107 {
108     unsigned crc;
109 public:
reset()110     void reset()
111     {
112         crc = crc32( 0L, Z_NULL, 0 );
113     }
get()114     unsigned get()
115     {
116         return crc;
117     }
update(const void * data,unsigned size)118     unsigned update( const void * data, unsigned size )
119     {
120         crc = crc32( crc, (const unsigned char *)data, size );
121         return crc;
122     }
update(unsigned char b)123     unsigned update( unsigned char b )
124     {
125         return update( &b, sizeof(b) );
126     }
update(unsigned short b)127     unsigned update( unsigned short b )
128     {
129         return update( &b, sizeof(b) );
130     }
update(unsigned int b)131     unsigned update( unsigned int b )
132     {
133         return update( &b, sizeof(b) );
134     }
TinyDictCRC()135     TinyDictCRC()
136     {
137         reset();
138     }
139 };
140 
141 class TinyDictZStream
142 {
143     FILE * f;
144     TinyDictCRC crc;
145 
146     int type;
147     unsigned size;
148     unsigned txtpos;
149 
150     unsigned headerLength;
151     bool error;
152     unsigned short * chunks;
153     unsigned int * offsets;
154     unsigned extraLength;
155     unsigned char subfieldID1;
156     unsigned char subfieldID2;
157     unsigned subfieldLength;
158     unsigned subfieldVersion;
159     unsigned chunkLength;
160     unsigned chunkCount;
161 
162     bool     zInitialized;
163     z_stream zStream;
164     unsigned packed_size;
165     unsigned char * unp_buffer;
166     unsigned unp_buffer_start;
167     unsigned unp_buffer_len;
168     unsigned unp_buffer_size;
169 
readBytes(unsigned char * buf,unsigned size)170     unsigned int readBytes( unsigned char * buf, unsigned size )
171     {
172         if ( error || !f )
173             return 0;
174         unsigned int bytesRead = fread( buf, 1, size, f );
175         crc.update( buf, bytesRead );
176         return bytesRead;
177     }
178 
readU32()179     unsigned int readU32()
180     {
181         unsigned char buf[4];
182         if ( !error && f && fread( buf, 1, 4, f )==4 ) {
183             crc.update( buf, 4 );
184             return (((((((unsigned int)buf[3]) << 8) + buf[2]) << 8) + buf[1]) << 8 ) + buf[0];
185         }
186         error = true;
187         return 0;
188     }
189 
readU16()190     unsigned short readU16()
191     {
192         unsigned char buf[2];
193         if ( !error && f && fread( buf, 1, 2, f )==2 ) {
194             crc.update( buf, 2 );
195             return (((unsigned short)buf[1]) << 8) + buf[0];
196         }
197         error = true;
198         return 0;
199     }
200 
readU8()201     unsigned char readU8()
202     {
203         unsigned char buf[1];
204         if ( !error && f && fread( buf, 1, 1, f )==1 ) {
205             crc.update( buf, 1 );
206             return buf[0];
207         }
208         error = true;
209         return 0;
210     }
211 
212     bool zinit(unsigned char * next_in, unsigned avail_in, unsigned char * next_out, unsigned avail_out);
213 
214     bool zclose();
215 
216     bool readChunk( unsigned n );
217 
218 public:
219 	/// minimize memory consumption
220 	void compact();
221 	/// get unpacked data size
getSize()222     unsigned getSize() { return size; }
223 	/// create uninitialized stream
224     TinyDictZStream();
225 	/// open from file
226     bool open( FILE * file );
227 	/// read block of data
228     bool read( unsigned char * buf, unsigned start, unsigned len );
229 	/// close stream
230     ~TinyDictZStream();
231 };
232 
233 class TinyDictDataFile : public TinyDictFileBase
234 {
235     bool compressed;
236     char * buf;
237     int    buf_size;
238 
239     TinyDictZStream zstream;
240 
reserve(int sz)241     void reserve( int sz )
242     {
243         if ( buf_size < sz ) {
244             char * oldptr = buf;
245             buf = (char*) realloc( buf, sizeof(char) * sz );
246             if ( !buf) {
247                 free(oldptr);
248                 fprintf(stderr, "out of memory\n");
249                 exit(-2);
250             }
251             buf_size = sz;
252         }
253     }
254 
255 
256 public:
257 
compact()258 	void compact()
259 	{
260 		zstream.compact();
261 		if ( buf ) {
262 			free( buf );
263 			buf = NULL;
264 			buf_size = 0;
265 		}
266 	}
267 
268 	const char * read( const TinyDictWord * w );
269 
270     bool open( const char * filename );
271 
TinyDictDataFile()272     TinyDictDataFile() : compressed(false), buf(0), buf_size(0)
273     {
274     }
275 
~TinyDictDataFile()276     virtual ~TinyDictDataFile()
277     {
278         if ( buf )
279             free( buf );
280     }
281 };
282 
283 
/// Decoding table for the base64 digits used in the index file: maps an
/// ASCII code to its 6-bit value, or -1 for characters outside the alphabet.
/// Filled on the first call to parseBase64(). Entry 0 doubles as the
/// "not filled yet" flag: it is 0 only before the fill, -1 afterwards.
static int base64table[128] = { 0 };
static const char * base64chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/// Decode a number written as base64 digits, most significant digit first.
/// \param str NUL-terminated digit string
/// \return the decoded value (0 for an empty string), or (unsigned)-1 if the
///         string contains any character that is not a base64 digit
static unsigned parseBase64( const char * str )
{
    if ( !*base64table ) {
        int i;
        for ( i=0; i<128; i++ )
            base64table[i] = -1;
        for ( i=0; base64chars[i]; i++ )
            base64table[(unsigned char)base64chars[i]] = i;
    }
    unsigned n = 0;
    for ( ; *str; str++ ) {
        // Go through unsigned char: plain char may be negative for bytes
        // >= 0x80, and converting that to unsigned gives an index far
        // outside the 128-entry table.
        const unsigned char ch = (unsigned char)*str;
        if ( ch >= 128 )
            return (unsigned)-1;
        const int code = base64table[ ch ];
        if ( code<0 )
            return (unsigned)-1;
        n = ( n << 6 ) + code;
    }
    return n;
}
305 
compare(const char * str) const306 int TinyDictWord::compare( const char * str ) const
307 {
308     return strcmp( word, str );
309 }
310 
/// Read one line from \a f into \a buf, without the trailing newline.
///
/// Reading stops at '\n' (consumed, not stored), at end of file or read
/// error, at a NUL byte, or once \a size characters have been stored.
/// The result is always NUL-terminated, so \a buf must have room for
/// \a size + 1 bytes.
/// \return number of characters stored (0 for an empty line or at end of file)
static int my_fgets( char * buf, int size, FILE * f )
{
	int len = 0;
	while ( len < size ) {
		const int ch = fgetc( f );
		// EOF is negative and NUL is zero: both end the line, like '\n'
		if ( ch == '\n' || ch <= 0 )
			break;
		buf[len++] = (char) ch;
	}
	buf[len] = 0;
	return len;
}
326 
327 /// factory - reading from index file
read(FILE * f,unsigned index)328 TinyDictWord * TinyDictWord::read( FILE * f, unsigned index )
329 {
330     if ( !f || feof(f) )
331         return NULL;
332     char buf[1024];
333     unsigned indexpos = ftell( f );
334     int sz = my_fgets( buf, 1023, f );
335 	if ( !sz )
336         return NULL;
337     int tabc = 0;
338     int tabs[2];
339     for ( int i=0; buf[i]; i++ ) {
340         if ( buf[i] == '\t' && tabc<2 )
341             tabs[tabc++] = i;
342     }
343     if ( tabc!=2 )
344         return NULL;
345     const char * word = buf;
346     const char * pos_str = buf + tabs[0] + 1;
347     const char * len_str = buf + tabs[1] + 1;
348     buf[tabs[0]] = 0;
349     buf[tabs[1]] = 0;
350     unsigned start = parseBase64( pos_str );
351     unsigned len = parseBase64( len_str );
352     if ( start==(unsigned)-1 || len==(unsigned)-1 )
353         return NULL;
354     return new TinyDictWord( index, indexpos, start, len, word );
355 }
356 
find(const char * prefix)357 int TinyDictWordList::find( const char * prefix )
358 {
359     if ( !count )
360         return -1;
361     int a = 0;
362     int b = count;
363     for ( ;a < b-1; ) {
364         int c = (a + b) / 2;
365         int res = list[c]->compare( prefix );
366         if ( !res )
367             return c;
368         if ( res < 0 ) {
369             a = c + 1;
370         } else {
371             b = c;
372         }
373     }
374     if ( a==0 || list[a]->compare( prefix )<0 )
375         return a;
376     return a - 1;
377 }
378 
match(const char * str,bool exact) const379 bool TinyDictWord::match( const char * str, bool exact ) const
380 {
381     if ( exact )
382         return !strcmp( word, str );
383     int i=0;
384     for ( ; str[i]; i++  ) {
385         if ( str[i] != word[i] )
386             return false;
387     }
388     return str[i]==0;
389 }
390 
find(const char * prefix,bool exactMatch,TinyDictWordList & words)391 bool TinyDictIndexFile::find( const char * prefix, bool exactMatch, TinyDictWordList & words )
392 {
393     words.clear();
394     int n = list.find( prefix );
395     if ( n<0 )
396         return false;
397     TinyDictWord * p = list.get( n );
398     if ( fseek( f, p->getIndexPos(), SEEK_SET ) )
399         return false;
400     int index = p->getIndex();
401     for ( ;; ) {
402         p = TinyDictWord::read( f, index++ );
403         if ( !p )
404             break;
405         int res = p->compare( prefix );
406         if ( p->match( prefix, exactMatch ) )
407             words.add( p );
408 		else {
409             delete p;
410 			if ( res > 0 )
411 	            break;
412         }
413     }
414     return true;
415 }
416 
open(const char * filename)417 bool TinyDictIndexFile::open( const char * filename )
418 {
419     close();
420     if ( filename )
421         setFilename( filename );
422     if ( !fname )
423         return false;
424     f = fopen( fname, "rb" );
425     if ( !f )
426         return false;
427     if ( fseek( f, 0, SEEK_END ) ) {
428         close();
429         return false;
430     }
431     size = ftell( f );
432     if ( fseek( f, 0, SEEK_SET ) ) {
433         close();
434         return false;
435     }
436     // test
437     TinyDictWord * p;
438     count = 0;
439     for ( ;; count++ ) {
440         p = TinyDictWord::read( f, count );
441         if ( !p )
442             break;
443         if ( (count % factor) == 0 ) {
444             list.add( p );
445         } else {
446             delete p;
447         }
448     }
449     printf("%d words read from index\n", count);
450     return true;
451 }
452 
/// Data file kinds as classified by TinyDictZStream::open().
enum {
    DICT_TEXT = 0,  ///< file does not start with the GZIP magic bytes
    DICT_GZIP = 1,  ///< GZIP header recognised
    DICT_DZIP = 2   ///< GZIP header with an extra field holding the chunk table
};
458 
zinit(unsigned char * next_in,unsigned avail_in,unsigned char * next_out,unsigned avail_out)459 bool TinyDictZStream::zinit( unsigned char * next_in, unsigned avail_in, unsigned char * next_out, unsigned avail_out )
460 {
461     zclose();
462     if ( !zInitialized ) {
463         zStream.zalloc    = NULL;
464         zStream.zfree     = NULL;
465         zStream.opaque    = NULL;
466         zStream.next_in   = next_in;
467         zStream.avail_in  = avail_in;
468         zStream.next_out  = next_out;
469         zStream.avail_out = avail_out;
470         if (inflateInit2( &zStream, -15 ) != Z_OK ) {
471             // zlib initialization failed
472             return false;
473         }
474         zInitialized = true;
475     }
476 	return true;
477 }
478 
zclose()479 bool TinyDictZStream::zclose()
480 {
481     if ( zInitialized ) {
482         inflateEnd( &zStream );
483         zInitialized = false;
484     }
485     return true;
486 }
487 
compact()488 void TinyDictZStream::compact()
489 {
490 	if ( unp_buffer ) {
491 		free( unp_buffer );
492 		unp_buffer_start = unp_buffer_len = unp_buffer_size = 0;
493 		unp_buffer = NULL;
494 	}
495 }
496 
readChunk(unsigned n)497 bool TinyDictZStream::readChunk( unsigned n )
498 {
499     if ( n >= chunkCount )
500         return false;
501     if ( !unp_buffer ) {
502         unp_buffer = (unsigned char *)malloc( sizeof(unsigned char)*chunkLength );
503         unp_buffer_size = chunkLength;
504     }
505     unp_buffer_start = n * chunkLength;
506 
507     if ( fseek( f, offsets[ n ], SEEK_SET ) ) {
508         printf( "cannot seek to %d position\n", offsets[n] );
509         return false;
510     }
511     unsigned packsz = chunks[n];
512     unsigned char * tmp = (unsigned char *)malloc( sizeof(unsigned char) * packsz );
513 
514     crc.reset();
515     unsigned int bytesRead = readBytes( tmp, packsz );
516     unsigned crc1 = crc.get();
517     unsigned crc2 = readU32();
518     if ( bytesRead != packsz || error ) {
519         printf( "error reading packed data\n" );
520         free( tmp );
521         return false;
522     }
523     if ( crc1!=crc2  ) {
524         printf( "CRC error: real: %08x expected: %08x\n", crc1, crc2 );
525         //free( tmp );
526         //return false;
527     }
528     zclose();
529     if ( !zinit(tmp, packsz, unp_buffer, unp_buffer_size) ) {
530         printf("cannot init deflater\n");
531         return false;
532     }
533     printf("unpacking %d bytes\n", packsz);
534     int err = inflate( &zStream,  Z_PARTIAL_FLUSH );
535     printf("inflate result: %d\n", err);
536     if ( err != Z_OK ) {
537         printf("Inflate error %s (%d). avail_in=%d, avail_out=%d \n", zStream.msg, err, (int)zStream.avail_in, (int)zStream.avail_out);
538         free( tmp );
539         return false;
540     }
541     if ( zStream.avail_in ) {
542         printf("Inflate: not all data read, still %d bytes available\n", (int)zStream.avail_in );
543         free( tmp );
544         return false;
545     }
546     unp_buffer_len = unp_buffer_size - zStream.avail_out;
547 
548     printf("freeing tmp\n");
549     free( tmp );
550     printf("done\n");
551 
552 
553 
554     if ( n < chunkCount-1 && unp_buffer_len!=chunkLength ) {
555         printf("wrong chunk length\n");
556         return false; // too short chunk data
557     }
558 
559 
560     zclose();
561     return true;
562 }
563 
read(unsigned char * buf,unsigned start,unsigned len)564 bool TinyDictZStream::read( unsigned char * buf, unsigned start, unsigned len )
565 {
566     if ( start >= unp_buffer_start && start < unp_buffer_start + unp_buffer_len ) {
567         unsigned readyBytes = unp_buffer_len - (start-unp_buffer_start);
568         if ( readyBytes > len )
569             readyBytes = len;
570         memcpy( buf, unp_buffer + (start-unp_buffer_start), readyBytes );
571         buf += readyBytes;
572         start += readyBytes;
573         len -= readyBytes;
574         if ( !len )
575             return true;
576     }
577     unsigned n = start / chunkLength;
578     if ( !readChunk( n ) )
579         return false;
580     unsigned readyBytes = unp_buffer_len - (start-unp_buffer_start);
581     if ( readyBytes > len )
582         readyBytes = len;
583     memcpy( buf, unp_buffer + (start-unp_buffer_start), readyBytes );
584     buf += readyBytes;
585     start += readyBytes;
586     len -= readyBytes;
587     if ( !len )
588         return true;
589     return false;
590 }
591 
TinyDictZStream()592 TinyDictZStream::TinyDictZStream()
593 : f ( NULL ), size( 0 ), txtpos(0)
594 , headerLength(0), error( false )
595 , chunks(NULL), offsets(NULL), chunkLength(0), chunkCount(0)
596 , zInitialized(false), packed_size(0), unp_buffer(NULL), unp_buffer_start(0), unp_buffer_len(0), unp_buffer_size(0)
597 {
598     memset( &zStream, 0, sizeof(zStream) );
599 }
600 
~TinyDictZStream()601 TinyDictZStream::~TinyDictZStream()
602 {
603     zclose();
604     if ( unp_buffer )
605         free( unp_buffer );
606     if ( chunks )
607         delete [] chunks;
608     if ( offsets )
609         delete [] offsets;
610 }
611 
open(FILE * file)612 bool TinyDictZStream::open( FILE * file )
613 {
614     f = file;
615     error = false;
616     if ( fseek( f, 0, SEEK_END ) ) {
617         return false;
618     }
619     packed_size = ftell( f );
620     if ( fseek( f, 0, SEEK_SET ) ) {
621         return false;
622     }
623 
624     crc.reset();
625     unsigned char header[10];
626     if ( fread( header, 1, sizeof(header), f )!=sizeof(header) ) {
627         return false;
628     }
629     crc.update( header, sizeof(header) );
630     if ( header[0]!=0x1f || header[1]!=0x8b ) { // 0x1F 0x8B -- GZIP magic
631         type = DICT_TEXT;
632         return true;
633     }
634     if ( header[2]!=8 ) {
635         // unknown compression method
636         return false;
637     }
638     unsigned char flg = header[3];
639     headerLength = 10;
640 
641     //const char FTEXT   = 1;    // Extra text
642     const char FHCRC   = 2;    // Header CRC
643     const char FEXTRA  = 4;    // Extra field
644     const char FNAME   = 8;    // File name
645     const char FCOMMENT = 16;   // File comment
646 
647     type = DICT_GZIP;
648 
649     packed_size = size;
650 
651     // Optional extra field
652     if ( flg & FEXTRA ) {
653         type = DICT_DZIP;
654         extraLength = readU16();
655         headerLength += extraLength + 2;
656         subfieldID1 = readU8();
657         subfieldID2 = readU8();
658         subfieldLength = readU16(); // 2 bytes subfield length
659         subfieldVersion = readU16(); // 2 bytes subfield version
660         chunkLength = readU16(); // 2 bytes chunk length
661         chunkCount = readU16(); // 2 bytes chunk count
662         if ( error ) {
663             return false;
664         }
665         chunks = new unsigned short[ chunkCount ];
666         for (unsigned i=0; i<chunkCount; i++) {
667             chunks[i] = readU16();
668         }
669         size = 0;
670     } else {
671         // GZIP is not supported, use DZIP
672         return false;
673     }
674     // Skip optional file name
675     if ( flg & FNAME ) {
676         while (readU8() != 0 )
677             headerLength++;
678         headerLength++;
679     }
680     // Skip optional file comment
681     if ( flg & FCOMMENT ) {
682         while (readU8() != 0)
683             headerLength++;
684         headerLength++;
685     }
686     // Check optional header CRC
687     if ( flg & FHCRC ) {
688         int v = (int)crc.get() & 0xffff;
689         if (readU16() != v) {
690             // CRC failed
691             error = true;
692         }
693         headerLength += 2;
694     }
695 
696     if ( chunkCount ) {
697         offsets = new unsigned int[ chunkCount ];
698         offsets[0] = headerLength;
699         size = chunks[0];
700         for ( unsigned i=1; i<chunkCount; i++ ) {
701             offsets[i] = offsets[i-1] + chunks[i-1];
702             size += chunks[i];
703         }
704     }
705 
706     if ( fseek( f, headerLength, SEEK_SET ) ) {
707         return false;
708     }
709 
710     if ( !readChunk( chunkCount-1 ) ) {
711         printf("Error reading chunk %d\n", chunkCount-1 );
712         return false;
713     }
714     size = unp_buffer_start + unp_buffer_len;
715 
716 	compact();
717     return true;
718 }
719 
read(const TinyDictWord * w)720 const char * TinyDictDataFile::read( const TinyDictWord * w )
721 {
722     if ( !f || !w || w->getStart() + w->getSize() > size ) {
723         printf("article is out of file range (%d)\n", (int)size);
724         return NULL;
725     }
726 
727     reserve( w->getSize() + 1 );
728     if ( !compressed ) {
729         // uncompressed
730         printf("reading uncompressed article\n");
731         if ( fseek( f, w->getStart(), SEEK_SET ) )
732             return NULL;
733         if ( fread( buf, 1, w->getSize(), f ) != w->getSize() )
734             return NULL;
735     } else {
736         // compressed
737         printf("reading compressed article\n");
738         if ( !zstream.read( (unsigned char*)buf, w->getStart(), w->getSize() ) )
739             return NULL;
740     }
741     buf[ w->getSize() ] = 0;
742     return buf;
743 }
744 
open(const char * filename)745 bool TinyDictDataFile::open( const char * filename )
746 {
747     close();
748     if ( filename )
749         setFilename( filename );
750     if ( !fname )
751         return false;
752     f = fopen( fname, "rb" );
753     if ( !f )
754         return false;
755     if ( fseek( f, 0, SEEK_END ) ) {
756         close();
757         return false;
758     }
759     size = ftell( f );
760     if ( fseek( f, 0, SEEK_SET ) ) {
761         close();
762         return false;
763     }
764 
765 
766     unsigned char header[10];
767     if ( fread( header, 1, sizeof(header), f )!=sizeof(header) ) {
768         close();
769         return false;
770     }
771 
772     if ( header[0]!=0x1f || header[1]!=0x8b ) { // 0x1F 0x8B -- GZIP magic
773         compressed = false;
774         printf("data file %s is not compressed\n", filename);
775         return true;
776     }
777 
778     printf("data file %s is compressed\n", filename);
779     compressed = true;
780     if ( !zstream.open( f ) ) {
781         printf("data file %s opening error\n", filename);
782         close();
783         return false;
784     }
785     size = zstream.getSize();
786     return true;
787 }
788 
TinyDictionary()789 TinyDictionary::TinyDictionary()
790 {
791 	name = NULL;
792 	data = new TinyDictDataFile();
793 	index = new TinyDictIndexFile();
794 }
795 
~TinyDictionary()796 TinyDictionary::~TinyDictionary()
797 {
798 	delete data;
799 	delete index;
800 	if ( name )
801 		free( name );
802 }
803 
compact()804 void TinyDictionary::compact()
805 {
806 	index->compact();
807 	data->compact();
808 }
809 
getDictionaryName()810 const char * TinyDictionary::getDictionaryName()
811 {
812 	return name;
813 }
814 
open(const char * indexfile,const char * datafile)815 bool TinyDictionary::open( const char * indexfile, const char * datafile )
816 {
817 	// use index file name w/o path and extension as dictionary name
818 	int lastSlash = -1;
819 	int lastPoint = -1;
820 	for ( int i=0; indexfile[i]; i++ ) {
821 		if ( indexfile[i]=='/' || indexfile[i]=='\\' )
822 			lastSlash = i;
823 		else if ( indexfile[i]=='.' )
824 			lastPoint = i;
825 	}
826 	if ( lastPoint>=0 && lastPoint>lastSlash ) {
827 		name = strdup( indexfile + lastSlash + 1 );
828 		name[ lastPoint - lastSlash - 1 ] = 0;
829 	}
830 	return index->open( indexfile ) && data->open( datafile );
831 }
832 
833 /// returns word list's dictionary name
getDictionaryName()834 const char * TinyDictWordList::getDictionaryName()
835 {
836 	if ( !dict )
837 		return NULL;
838 	return dict->getDictionaryName();
839 }
840 
841 /// returns article for word by index
getArticle(int index)842 const char * TinyDictWordList::getArticle( int index )
843 {
844 	if ( !dict )
845 		return NULL;
846 	if ( index<0 || index>=count )
847 		return NULL;
848 	return dict->getData()->read( list[index] );
849 }
850 
851 /// searches dictionary for specified word, caller is responsible for deleting of returned object
find(const char * prefix,int options)852 TinyDictWordList * TinyDictionary::find( const char * prefix, int options )
853 {
854 	TinyDictWordList * list = new TinyDictWordList();
855 	list->setDict( this );
856 	if ( index->find( prefix, (TINY_DICT_OPTION_STARTS_WITH & options) == 0, *list ) && list->length()>0 )
857 		return list;
858 	delete list;
859 	return NULL;
860 }
861 
add(const char * indexfile,const char * datafile)862 bool TinyDictionaryList::add( const char * indexfile, const char * datafile )
863 {
864 	TinyDictionary * p = new TinyDictionary();
865 	if ( !p->open( indexfile, datafile ) ) {
866 		delete p;
867 		return false;
868 	}
869     if ( count>=size ) {
870         size = size ? size * 2 : 32;
871         list = (TinyDictionary**)realloc( list, sizeof(TinyDictionary *) * size );
872     }
873     list[ count++ ] = p;
874     return true;
875 }
876 
877 /// create empty list
TinyDictionaryList()878 TinyDictionaryList::TinyDictionaryList() : list(NULL), size(0), count(0) { }
879 
880 /// remove all dictionaries from list
clear()881 void TinyDictionaryList::clear()
882 {
883 	if ( list ) {
884 		for ( int i=0; i<count; i++ )
885 			delete list[i];
886 		free( list );
887 		list = NULL;
888 		size = 0;
889 		count = 0;
890 	}
891 }
892 
~TinyDictionaryList()893 TinyDictionaryList::~TinyDictionaryList()
894 {
895 	clear();
896 }
897 
898 /// search all dictionaries in list for specified pattern
find(TinyDictResultList & result,const char * prefix,int options)899 bool TinyDictionaryList::find( TinyDictResultList & result, const char * prefix, int options )
900 {
901 	result.clear();
902 	for ( int i=0; i<count; i++ ) {
903 		TinyDictWordList * p = list[i]->find( prefix, options );
904 		if ( p )
905 			result.add( p );
906 	}
907 	return result.length() > 0;
908 }
909 
910 
911 /// remove all dictionaries from list
clear()912 void TinyDictResultList::clear()
913 {
914 	if ( list ) {
915 		for ( int i=0; i<count; i++ )
916 			delete list[i];
917 		free( list );
918 		list = NULL;
919 		size = 0;
920 		count = 0;
921 	}
922 }
923 
924 /// create empty list
TinyDictResultList()925 TinyDictResultList::TinyDictResultList() : list(NULL), size(0), count(0) { }
926 
927 /// destructor
~TinyDictResultList()928 TinyDictResultList::~TinyDictResultList()
929 {
930 	clear();
931 }
932 
933 /// add item to list
add(TinyDictWordList * p)934 void TinyDictResultList::add( TinyDictWordList * p )
935 {
936     if ( count>=size ) {
937         size = size ? size * 2 : 32;
938         list = (TinyDictWordList**)realloc( list, sizeof(TinyDictWordList *) * size );
939     }
940     list[ count++ ] = p;
941 }
942 
#ifdef TEST_APP
/// Manual smoke test, built only when TEST_APP is defined. Expects
/// mueller7.index, mueller7.dict and mueller7.dict.dz in the current
/// directory. It exercises the low-level index/data classes first and then
/// the TinyDictionaryList front end.
int main( int argc, const char * * argv )
{
    TinyDictIndexFile index;
    TinyDictDataFile data;
    TinyDictDataFile zdata;
    if ( !index.open("mueller7.index") ) {
        printf("cannot open index file mueller7.index\n");
        return -1;
    }
    if ( !data.open("mueller7.dict") ) {
        printf("cannot open data file mueller7.dict\n");
        return -1;
    }
    if ( !zdata.open("mueller7.dict.dz") ) {
        printf("cannot open data file mueller7.dict.dz\n");
        return -1;
    }
    TinyDictWordList words;
    const char * pattern = "full";
    index.find( pattern, true, words );
    printf( "%d words matched pattern %s\n", words.length(), pattern );
    for ( int i=0; i<words.length(); i++ ) {
        TinyDictWord * p = words.get(i);
        printf("%s %d %d\n", p->getWord(), p->getStart(), p->getSize() );
        const char * text = zdata.read( p );
        if ( text )
            printf( "article:\n%s\n", text );
        else
            printf( "cannot read article\n" );
    }

	{
		// create TinyDictionaryList object
		TinyDictionaryList dicts;
		// register dictionaries using
		dicts.add( "mueller7.index", "mueller7.dict.dz" );

		// container for results
		TinyDictResultList results;
	    dicts.find(results, "empty", 0 ); // find exact match

		// for each source dictionary that matches pattern
		for ( int d = 0; d<results.length(); d++ ) {
			TinyDictWordList * words = results.get(d);
			// both the name and the article may come back as NULL, which
			// must not be handed to %s
			const char * dictName = words->getDictionaryName();
			printf("dict: %s\n", dictName ? dictName : "(unnamed)" );
			// for each found word
			for ( int i=0; i<words->length(); i++ ) {
				TinyDictWord * word = words->get(i);
				printf("word: %s\n", word->getWord() );
				const char * article = words->getArticle( i );
				printf("article: %s\n", article ? article : "(cannot read article)" );
			}
		}
	}
#ifdef _WIN32
	printf("Press any key...");
	getchar();
#endif
    return 0;
}
#endif
1004