1 // This file is part of The New Aspell
2 // Copyright (C) 2000-2001,2011 by Kevin Atkinson under the GNU LGPL
3 // license version 2.0 or 2.1.  You should have received a copy of the
4 // LGPL license along with this library if you did not you can find it
5 // at http://www.gnu.org/.
6 
7 // Aspell's main word list is laid out as follows:
8 //
9 // * header
10 // * jump table for editdist 1
11 // * jump table for editdist 2
12 // * data block
13 // * hash table
14 
15 // data block laid out as follows:
16 //
17 // Words:
18 //   (<8 bit frequency><8 bit: flags><8 bit: offset to next word>
19 //      <8 bit: word size><word><null>
20 //      [<affix info><null>][<category info><null>])+
21 // Words with soundslike:
22 //   (<8 bit: offset to next item><8 bit: soundslike size><soundslike>
23 //      <words with that soundlike>)+
24 // Flags are mapped as follows:
25 //   bits 0-3: word info
26 //   bit    4: duplicate flag
27 //   bit    5: <unused>
28 //   bit    6: have affix info
29 //   bit    7: have compound info
30 
31 #include <utility>
32 using std::pair;
33 
34 #include <string.h>
35 #include <stdio.h>
36 //#include <errno.h>
37 
38 #include "settings.h"
39 
40 #include "block_vector.hpp"
41 #include "config.hpp"
42 #include "data.hpp"
43 #include "data_util.hpp"
44 #include "errors.hpp"
45 #include "file_util.hpp"
46 #include "fstream.hpp"
47 #include "language.hpp"
48 #include "stack_ptr.hpp"
49 #include "objstack.hpp"
50 #include "vector.hpp"
51 #include "vector_hash-t.hpp"
52 #include "check_list.hpp"
53 #include "lsort.hpp"
54 
55 #include "iostream.hpp"
56 
57 #include "gettext.h"
58 
59 typedef unsigned int   u32int;
60 static const u32int u32int_max = (u32int)-1;
61 typedef unsigned short u16int;
62 typedef unsigned char byte;
63 
64 #ifdef USE_32_BIT_HASH_FUN
65 typedef u32int hash_int_t;
66 #else
67 typedef size_t hash_int_t;
68 #endif
69 
70 #ifdef HAVE_MMAP
71 
72 // POSIX headers
73 #include <fcntl.h>
74 #include <unistd.h>
75 #include <sys/mman.h>
76 
77 #endif
78 
79 #ifndef MAP_FAILED
80 #define MAP_FAILED (-1)
81 #endif
82 
83 #ifdef HAVE_MMAP
84 
mmap_open(unsigned int block_size,FStream & f,unsigned int offset)85 static inline char * mmap_open(unsigned int block_size,
86 			       FStream & f,
87 			       unsigned int offset)
88 {
89   f.flush();
90   int fd = f.file_no();
91   return static_cast<char *>
92     (mmap(NULL, block_size, PROT_READ, MAP_SHARED, fd, offset));
93 }
94 
mmap_free(char * block,unsigned int size)95 static inline void mmap_free(char * block, unsigned int size)
96 {
97   munmap(block, size);
98 }
99 
100 #else
101 
mmap_open(unsigned int,FStream & f,unsigned int)102 static inline char * mmap_open(unsigned int,
103 			       FStream & f,
104 			       unsigned int)
105 {
106   return reinterpret_cast<char *>(MAP_FAILED);
107 }
108 
mmap_free(char *,unsigned int)109 static inline void mmap_free(char *, unsigned int)
110 {
111   abort();
112 }
113 
114 #endif
115 
116 static byte HAVE_AFFIX_FLAG = 1 << 7;
117 static byte HAVE_CATEGORY_FLAG = 1 << 6;
118 
119 static byte DUPLICATE_FLAG = 1 << 4;
120 // this flag is set when there is is more than one word for a
121 // particulear "clean" word such as "jello" "Jello".  It is set on all
122 // but the last word of the group.  Ie, if it is set than the next
123 // word when converted to its "clean" form equals the same value.
124 
125 static byte WORD_INFO_MASK = 0x0F;
126 
127 static const int FREQUENCY_INFO_O = 4;
128 static const int FLAGS_O = 3;
129 static const int NEXT_O = 2;
130 static const int WORD_SIZE_O = 1;
131 
get_word_size(const char * d)132 static inline int get_word_size(const char * d) {
133   return *reinterpret_cast<const byte *>(d - WORD_SIZE_O);
134 }
135 
get_flags(const char * d)136 static inline byte get_flags(const char * d) {
137   return *reinterpret_cast<const byte *>(d - FLAGS_O);
138 }
139 
get_offset(const char * d)140 static inline byte get_offset(const char * d) {
141   return *reinterpret_cast<const byte *>(d - NEXT_O);
142 }
143 
get_next(const char * d)144 static inline const char * get_next(const char * d) {
145   return d + *reinterpret_cast<const byte *>(d - NEXT_O);
146 }
147 
get_sl_words_begin(const char * d)148 static inline const char * get_sl_words_begin(const char * d) {
149   return d + *reinterpret_cast<const byte *>(d - WORD_SIZE_O) + 4;
150   // FIXME: This isn't right when frequency info is stored in the table
151 }
152 
153 // get_next might go past the end so don't JUST compare
154 // for equality.  Ie use while (cur < end) not (cur != end)
get_sl_words_end(const char * d)155 static inline const char * get_sl_words_end(const char * d) {
156   return get_next(d) - 3;
157 }
158 
get_affix(const char * d)159 static inline const char * get_affix(const char * d) {
160   int word_size = get_word_size(d);
161   if (get_flags(d) & HAVE_AFFIX_FLAG)
162     return d + word_size +  1;
163   else
164     return d + word_size;
165 }
166 
get_category(const char * d)167 static inline const char * get_category(const char * d) {
168   int word_size = get_word_size(d);
169   if (get_flags(d) & (HAVE_AFFIX_FLAG | HAVE_CATEGORY_FLAG))
170     return d + strlen(d + word_size +  1) + 1;
171   else if (get_flags(d) & HAVE_CATEGORY_FLAG)
172     return d + word_size +  1;
173   else
174     return d + word_size;
175 }
176 
duplicate_flag(const char * d)177 static inline bool duplicate_flag(const char * d) {
178   return get_flags(d) & DUPLICATE_FLAG;
179 }
180 
181 namespace {
182 
183   using namespace aspeller;
184 
185   /////////////////////////////////////////////////////////////////////
186   //
187   //  ReadOnlyDict
188   //
189 
190   struct Jump
191   {
192     char   sl[4];
193     u32int loc;
Jump__anon395da9830111::Jump194     Jump() {memset(this, 0, sizeof(Jump));}
195   };
196 
197   class ReadOnlyDict : public Dictionary
198   {
199 
200   public: //but don't use
201 
202     struct WordLookupParms {
203       const char * block_begin;
WordLookupParms__anon395da9830111::ReadOnlyDict::WordLookupParms204       WordLookupParms() {}
205       typedef BlockVector<const u32int> Vector;
206       typedef u32int                    Value;
207       typedef const char *              Key;
208       static const bool is_multi = false;
key__anon395da9830111::ReadOnlyDict::WordLookupParms209       Key key(Value v) const {return block_begin + v;}
210       InsensitiveHash<hash_int_t> hash;
211       InsensitiveEqual equal;
is_nonexistent__anon395da9830111::ReadOnlyDict::WordLookupParms212       bool is_nonexistent(Value v) const {return v == u32int_max;}
make_nonexistent__anon395da9830111::ReadOnlyDict::WordLookupParms213       void make_nonexistent(const Value & v) const {abort();}
214     };
215     typedef VectorHashTable<WordLookupParms> WordLookup;
216 
217   public: // but don't use
218 
219     char *           block;
220     u32int           block_size;
221     char *           mmaped_block;
222     u32int           mmaped_size;
223     const Jump * jump1;
224     const Jump * jump2;
225     WordLookup       word_lookup;
226     const char *     word_block;
227     const char *     first_word;
228 
229     ReadOnlyDict(const ReadOnlyDict&);
230     ReadOnlyDict& operator= (const ReadOnlyDict&);
231 
232     struct Elements;
233     struct SoundslikeElements;
234 
235   public:
236     WordEntryEnumeration * detailed_elements() const;
237     Size      size()     const;
238     bool      empty()    const;
239 
ReadOnlyDict()240     ReadOnlyDict()
241       : Dictionary(basic_dict, "ReadOnlyDict")
242     {
243       block = 0;
244     }
245 
~ReadOnlyDict()246     ~ReadOnlyDict() {
247       if (block != 0) {
248 	if (mmaped_block)
249 	  mmap_free(mmaped_block, mmaped_size);
250 	else
251 	  free(block);
252       }
253     }
254 
255     PosibErr<void> load(ParmString, Config &, DictList *, SpellerImpl *);
256     PosibErr<void> check_hash_fun() const;
257     void low_level_dump() const;
258 
259     bool lookup(ParmString word, const SensitiveCompare *, WordEntry &) const;
260 
261     bool clean_lookup(ParmString, WordEntry &) const;
262 
263     bool soundslike_lookup(const WordEntry &, WordEntry &) const;
264     bool soundslike_lookup(ParmString, WordEntry &) const;
265 
266     SoundslikeEnumeration * soundslike_elements() const;
267 
268   };
269 
convert(const char * w,WordEntry & o)270   static inline void convert(const char * w, WordEntry & o) {
271     o.what = WordEntry::Word;
272     o.word = w;
273     o.aff  = get_affix(w);
274     o.word_size = get_word_size(w);
275     o.word_info = get_flags(w) & WORD_INFO_MASK;
276   }
277 
278   //
279   //
280   //
281 
282   struct ReadOnlyDict::Elements : public WordEntryEnumeration
283   {
284     const char * w;
285     WordEntry wi;
Elements__anon395da9830111::ReadOnlyDict::Elements286     Elements(const char * w0) : w(w0) {wi.what = WordEntry::Word;}
next__anon395da9830111::ReadOnlyDict::Elements287     WordEntry * next() {
288       if (get_offset(w) == 0) w += 2; // FIXME: This needs to be 3
289                                       //        when freq info is used
290       if (get_offset(w) == 0) return 0;
291       convert(w, wi);
292       w = get_next(w);
293       return &wi;
294     }
at_end__anon395da9830111::ReadOnlyDict::Elements295     bool at_end() const {return get_offset(w) == 0;}
clone__anon395da9830111::ReadOnlyDict::Elements296     WordEntryEnumeration * clone() const {return new Elements(*this);}
assign__anon395da9830111::ReadOnlyDict::Elements297     void assign (const WordEntryEnumeration * other) {
298       *this = *static_cast<const Elements *>(other);}
299   };
300 
detailed_elements() const301   WordEntryEnumeration * ReadOnlyDict::detailed_elements() const {
302     return new Elements(first_word);
303   }
304 
low_level_dump() const305   void ReadOnlyDict::low_level_dump() const {
306     bool next_dup = false;
307     const char * w = first_word;
308     for (;;) {
309       if (get_offset(w) == 0) w += 2; // FIXME: This needs to be 3
310                                       //        when freq info is used
311       if (get_offset(w) == 0) break;
312 
313       const char * aff = get_affix(w);
314       byte flags = get_flags(w);
315       byte word_info = flags & WORD_INFO_MASK;
316       byte offset = get_offset(w);
317       int size = get_word_size(w);
318       if (next_dup) printf("\\");
319       printf("%s", w);
320       if (flags & HAVE_AFFIX_FLAG) printf("/%s", aff);
321       if (word_info) printf(" [WI: %d]", word_info);
322       //if (flags & DUPLICATE_FLAG) printf(" [NEXT DUP]");
323       const char * p = w;
324       WordLookup::const_iterator i = word_lookup.find(w);
325       if (!next_dup) {
326         if (i == word_lookup.end())
327           printf(" <BAD HASH>");
328         else if (word_block + *i != w) {
329           printf(" <BAD HASH, got %s>", word_block + *i);
330         }
331         else
332           printf(" <hash ok>");
333       }
334       printf("\n");
335       String buf;
336       if (flags & DUPLICATE_FLAG) next_dup = true;
337       else next_dup = false;
338       w = get_next(w);
339     }
340   }
341 
check_hash_fun() const342   PosibErr<void> ReadOnlyDict::check_hash_fun() const {
343     const char * w = first_word;
344     for (;;) {
345       if (get_offset(w) == 0) w += 2; // FIXME: This needs to be 3
346                                       //        when freq info is used
347       if (get_offset(w) == 0) break;
348       if (get_word_size(w) >= 12) {
349         const char * p = w;
350         int clean_size = 0;
351         for (;;) {
352           if (!*p) goto next; // reached end before clean_size was at
353                               // least 12, thus skip
354           if (lang()->to_clean(*p)) ++clean_size;
355           if (clean_size >= 12) goto clean_size_ok;
356           ++p;
357         }
358       clean_size_ok:
359         WordLookup::const_iterator i = word_lookup.find(w);
360         if (i == word_lookup.end() || word_block + *i != w)
361           return make_err(bad_file_format, file_name(),
362                           _("Incompatible hash function."));
363         else
364           return no_err;
365       }
366     next:
367       while (get_flags(w) & DUPLICATE_FLAG)
368         w = get_next(w);
369       w = get_next(w);
370     }
371     return no_err;
372   }
373 
size() const374   ReadOnlyDict::Size ReadOnlyDict::size() const {
375     return word_lookup.size();
376   }
377 
empty() const378   bool ReadOnlyDict::empty() const {
379     return word_lookup.empty();
380   }
381 
382   static const char * const cur_check_word = "aspell default speller rowl 1.10";
383 
384   struct DataHead {
385     // all sizes except the last four must to divisible by:
386     static const unsigned int align = 16;
387     char check_word[64];
388     u32int endian_check; // = 12345678
389     char lang_hash[16];
390 
391     u32int head_size;
392     u32int block_size;
393     u32int jump1_offset;
394     u32int jump2_offset;
395     u32int word_offset;
396     u32int hash_offset;
397 
398     u32int word_count;
399     u32int word_buckets;
400     u32int soundslike_count;
401 
402     u32int dict_name_size;
403     u32int lang_name_size;
404     u32int soundslike_name_size;
405     u32int soundslike_version_size;
406 
407     u32int first_word_offset; // from word block
408 
409     byte affix_info; // 0 = none, 1 = partially expanded, 2 = full
410     byte invisible_soundslike;
411     byte soundslike_root_only;
412     byte compound_info; //
413     byte freq_info;
414   };
415 
load(ParmString f0,Config & config,DictList *,SpellerImpl *)416   PosibErr<void> ReadOnlyDict::load(ParmString f0, Config & config,
417                                     DictList *, SpellerImpl *)
418   {
419     set_file_name(f0);
420     const char * fn = file_name();
421 
422     FStream f;
423     RET_ON_ERR(f.open(fn, "rb"));
424 
425     DataHead data_head;
426 
427     f.read(&data_head, sizeof(DataHead));
428 
429 #if 0
430     COUT << "Head Size: " << data_head.head_size << "\n";
431     COUT << "Data Block Size: " << data_head.data_block_size << "\n";
432     COUT << "Hash Block Size: " << data_head.hash_block_size << "\n";
433     COUT << "Total Block Size: " << data_head.total_block_size << "\n";
434 #endif
435 
436     if (strcmp(data_head.check_word, cur_check_word) != 0)
437       return make_err(bad_file_format, fn);
438 
439     if (data_head.endian_check != 12345678)
440       return make_err(bad_file_format, fn, _("Wrong endian order."));
441 
442     CharVector word;
443 
444     word.resize(data_head.dict_name_size);
445     f.read(word.data(), data_head.dict_name_size);
446 
447     word.resize(data_head.lang_name_size);
448     f.read(word.data(), data_head.lang_name_size);
449 
450     PosibErr<void> pe = set_check_lang(word.data(),config);
451     if (pe.has_err()) {
452       if (pe.prvw_err()->is_a(language_related_error))
453         return pe.with_file(fn);
454       else
455         return pe;
456     }
457 
458     if (data_head.soundslike_name_size != 0) {
459       word.resize(data_head.soundslike_name_size);
460       f.read(word.data(), data_head.soundslike_name_size);
461 
462       if (strcmp(word.data(), lang()->soundslike_name()) != 0)
463         return make_err(bad_file_format, fn, _("Wrong soundslike."));
464 
465       word.resize(data_head.soundslike_version_size);
466       f.read(word.data(), data_head.soundslike_version_size);
467 
468       if (strcmp(word.data(), lang()->soundslike_version()) != 0)
469         return make_err(bad_file_format, fn, _("Wrong soundslike version."));
470     }
471 
472     invisible_soundslike = data_head.invisible_soundslike;
473     soundslike_root_only = data_head.soundslike_root_only;
474 
475     affix_compressed = data_head.affix_info;
476 
477     block_size = data_head.block_size;
478     int offset = data_head.head_size;
479     mmaped_block = mmap_open(block_size + offset, f, 0);
480     if( mmaped_block != (char *)MAP_FAILED) {
481       block = mmaped_block + offset;
482       mmaped_size = block_size + offset;
483     } else {
484       mmaped_block = 0;
485       block = (char *)malloc(block_size);
486       f.seek(data_head.head_size);
487       f.read(block, block_size);
488     }
489 
490     if (data_head.jump2_offset) {
491       fast_scan = true;
492       jump1 = reinterpret_cast<const Jump *>(block + data_head.jump1_offset);
493       jump2 = reinterpret_cast<const Jump *>(block + data_head.jump2_offset);
494     } else {
495       jump1 = jump2 = 0;
496     }
497 
498     word_block = block + data_head.word_offset;
499     first_word = word_block + data_head.first_word_offset;
500 
501     word_lookup.parms().block_begin = word_block;
502     word_lookup.parms().hash .lang     = lang();
503     word_lookup.parms().equal.cmp.lang = lang();
504     const u32int * begin = reinterpret_cast<const u32int *>
505       (block + data_head.hash_offset);
506     word_lookup.vector().set(begin, begin + data_head.word_buckets);
507     word_lookup.set_size(data_head.word_count);
508 
509     //low_level_dump();
510     RET_ON_ERR(check_hash_fun());
511 
512     return no_err;
513   }
514 
515   void lookup_adv(WordEntry * wi);
516 
prep_next(WordEntry * wi,const char * w,const SensitiveCompare * c,const char * orig)517   static inline void prep_next(WordEntry * wi,
518                                const char * w,
519                                const SensitiveCompare * c,
520                                const char * orig)
521   {
522   loop:
523     if (!duplicate_flag(w)) return;
524     w = get_next(w);
525     if (!(*c)(orig, w)) goto loop;
526     wi->intr[0] = (void *)w;
527     wi->intr[1] = (void *)c;
528     wi->intr[2] = (void *)orig;
529     wi->adv_ = lookup_adv;
530   }
531 
lookup_adv(WordEntry * wi)532   void lookup_adv(WordEntry * wi)
533   {
534     const char * w = (const char *)wi->intr[0];
535     const SensitiveCompare * c = (const SensitiveCompare *)wi->intr[1];
536     const char * orig = (const char *)wi->intr[2];
537     convert(w,*wi);
538     wi->adv_ = 0;
539     prep_next(wi, w, c, orig);
540   }
541 
lookup(ParmString word,const SensitiveCompare * c,WordEntry & o) const542   bool ReadOnlyDict::lookup(ParmString word, const SensitiveCompare * c,
543                             WordEntry & o) const
544   {
545     o.clear();
546     WordLookup::const_iterator i = word_lookup.find(word);
547     if (i == word_lookup.end()) return false;
548     const char * w = word_block + *i;
549     for (;;) {
550       if ((*c)(word, w)) {
551         convert(w,o);
552         prep_next(&o, w, c, word);
553         return true;
554       }
555       if (!duplicate_flag(w)) break;
556       w = get_next(w);
557     }
558     return false;
559   }
560 
561   struct ReadOnlyDict::SoundslikeElements : public SoundslikeEnumeration
562   {
563     WordEntry data;
564     const ReadOnlyDict * obj;
565     const Jump * jump1;
566     const Jump * jump2;
567     const char * cur;
568     const char * prev;
569     int level;
570     bool invisible_soundslike;
571 
572     WordEntry * next(int stopped_at);
573 
SoundslikeElements__anon395da9830111::ReadOnlyDict::SoundslikeElements574     SoundslikeElements(const ReadOnlyDict * o)
575       : obj(o), jump1(obj->jump1), jump2(obj->jump2), cur(0),
576         level(1), invisible_soundslike(o->invisible_soundslike) {
577       data.what = o->invisible_soundslike ? WordEntry::Word : WordEntry::Soundslike;}
578   };
579 
next(int stopped_at)580   WordEntry * ReadOnlyDict::SoundslikeElements::next(int stopped_at) {
581 
582     //CERR << level << ":" << stopped_at << "  :";
583     //CERR << jump1->sl << ":" << jump2->sl << "\n";
584 
585   loop:
586 
587     const char * tmp = cur;
588     const char * p;
589 
590     if (level == 1 && stopped_at < 2) {
591 
592       ++jump1;
593       tmp = jump1->sl;
594       goto jquit;
595 
596     } else if (level == 2 && stopped_at < 3) {
597 
598       ++jump2;
599       if (jump2[-1].sl[1] != jump2[0].sl[1]) {
600         ++jump1;
601         level = 1;
602         tmp = jump1->sl;
603       } else {
604         tmp = jump2->sl;
605       }
606       goto jquit;
607 
608     } else if (level == 1) {
609 
610       level = 2;
611       jump2 = obj->jump2 + jump1->loc;
612       tmp = jump2->sl;
613       goto jquit;
614 
615     } else if (level == 2) {
616 
617       tmp = cur = obj->word_block + jump2->loc;
618       level = 3;
619 
620     } else if (get_offset(cur) == 0) {
621 
622       level = 2;
623       ++jump2;
624       if (jump2[-1].sl[1] != jump2[0].sl[1]) {
625         level = 1;
626         ++jump1;
627         tmp = jump1->sl;
628       } else {
629         tmp = jump2->sl;
630       }
631       goto jquit;
632 
633     }
634 
635     cur = get_next(cur); // this will be the NEXT item looked at
636 
637     p = prev;
638     prev = tmp;
639     if (p) {
640       // PRECOND:
641       // unless stopped_at >= LARGE_NUM
642       //     strlen(p) >= stopped_at
643       //     (stopped_at >= 3) implies
644       //         strncmp(p, tmp, 3) == 0 if !invisible_soundslike
645       //         strncmp(to_sl(p), to_sl(tmp), 3) == 0 if invisible_soundslike
646       if (stopped_at == 3) {
647         if (p[3] == tmp[3]) goto loop;
648       } else if (stopped_at == 4) {
649         if (p[3] == tmp[3] && tmp[3] &&
650             p[4] == tmp[4]) goto loop;
651       } else if (stopped_at == 5) {
652         if (p[3] == tmp[3] && tmp[3] &&
653             p[4] == tmp[4] && tmp[4] &&
654             p[5] == tmp[5]) goto loop;
655       }
656     }
657 
658     data.word = tmp;
659     data.word_size = get_word_size(tmp);
660     if (invisible_soundslike) {
661       convert(tmp, data);
662     }
663     data.intr[0] = (void *)tmp;
664 
665     return &data;
666 
667   jquit:
668     prev = 0;
669     if (!*tmp) return 0;
670     data.word = tmp;
671     data.word_size = !tmp[1] ? 1 : !tmp[2] ? 2 : 3;
672     data.intr[0] = 0;
673     if (invisible_soundslike) {
674       data.what = WordEntry::Clean;
675       data.aff  = 0;
676     }
677     return &data;
678   }
679 
soundslike_elements() const680   SoundslikeEnumeration * ReadOnlyDict::soundslike_elements() const {
681 
682     return new SoundslikeElements(this);
683 
684   }
685 
soundslike_next(WordEntry * w)686   static void soundslike_next(WordEntry * w)
687   {
688     const char * cur = (const char *)(w->intr[0]);
689     const char * end = (const char *)(w->intr[1]);
690     convert(cur, *w);
691     cur = get_next(cur);
692     w->intr[0] = (void *)cur;
693     if (cur >= end) w->adv_ = 0;
694   }
695 
clean_lookup_adv(WordEntry * wi)696   static void clean_lookup_adv(WordEntry * wi)
697   {
698     const char * w = wi->word;
699     w = get_next(w);
700     convert(w,*wi);
701     if (!duplicate_flag(w)) wi->adv_ = 0;
702   }
703 
clean_lookup(ParmString sl,WordEntry & o) const704   bool ReadOnlyDict::clean_lookup(ParmString sl, WordEntry & o) const
705   {
706     o.clear();
707     WordLookup::const_iterator i = word_lookup.find(sl);
708     if (i == word_lookup.end()) return false;
709     const char * w = word_block + *i;
710     convert(w, o);
711     if (duplicate_flag(w)) o.adv_ = clean_lookup_adv;
712     return true;
713   }
714 
soundslike_lookup(const WordEntry & s,WordEntry & w) const715   bool ReadOnlyDict::soundslike_lookup(const WordEntry & s, WordEntry & w) const
716   {
717     if (s.intr[0] == 0) {
718 
719       return false;
720 
721     } else if (!invisible_soundslike) {
722 
723       w.clear();
724       w.what = WordEntry::Word;
725       w.intr[0] = (void *)get_sl_words_begin(s.word);
726       w.intr[1] = (void *)get_sl_words_end(s.word);
727       w.adv_ = soundslike_next;
728       soundslike_next(&w);
729       return true;
730 
731     } else {
732 
733       w.clear();
734       w.what = WordEntry::Word;
735       convert(s.word, w);
736       return true;
737 
738     }
739   }
740 
soundslike_lookup(ParmString s,WordEntry & w) const741   bool ReadOnlyDict::soundslike_lookup(ParmString s, WordEntry & w) const
742   {
743     if (invisible_soundslike) {
744       return ReadOnlyDict::clean_lookup(s,w);
745     } else {
746       return false;
747     }
748   }
749 
750 }
751 
752 namespace aspeller {
753 
new_default_readonly_dict()754   Dictionary * new_default_readonly_dict() {
755     return new ReadOnlyDict();
756   }
757 
758 }
759 
760 namespace {
761 
762   // Possible:
763   //   No Affix Compression:
764   //     no soundslike
765   //     invisible soundslike
766   //     with soundslike
767   //   Affix Compression:
768   //     group by root:
769   //       no soundslike
770   //       invisible soundslike
771   //       with soundslike
772   //     expand prefix:
773   //       no soundslike
774   //       invisible soundslike
775 
776   using namespace aspeller;
777 
778   struct WordData {
779     static const unsigned struct_size;
780     WordData * next;
781     char * sl;
782     char * aff;
783     byte word_size;
784     byte sl_size;
785     byte data_size;
786     byte flags;
787     char word[1];
788   };
789 
790   const unsigned WordData::struct_size = sizeof(WordData) - 1;
791 
792 
793   struct SoundslikeLess {
794     InsensitiveCompare icomp;
SoundslikeLess__anon395da9830211::SoundslikeLess795     SoundslikeLess(const Language * l) : icomp(l) {}
operator ()__anon395da9830211::SoundslikeLess796     bool operator() (WordData * x, WordData * y) const {
797       int res = strcmp(x->sl, y->sl);
798       if (res != 0) return res < 0;
799       res = icomp(x->word, y->word);
800       if (res != 0) return res < 0;
801       return strcmp(x->word, y->word) < 0;
802     }
803   };
804 
805   struct WordLookupParms {
806     const char * block_begin;
WordLookupParms__anon395da9830211::WordLookupParms807     WordLookupParms() {}
808     typedef acommon::Vector<u32int> Vector;
809     typedef u32int              Value;
810     typedef const char *        Key;
811     static const bool is_multi = false;
key__anon395da9830211::WordLookupParms812     Key key(Value v) const {return block_begin + v;}
813     InsensitiveHash<hash_int_t> hash;
814     InsensitiveEqual equal;
is_nonexistent__anon395da9830211::WordLookupParms815     bool is_nonexistent(Value v) const {return v == u32int_max;}
make_nonexistent__anon395da9830211::WordLookupParms816     void make_nonexistent(Value & v) const {v = u32int_max;}
817   };
818   typedef VectorHashTable<WordLookupParms> WordLookup;
819 
round_up(unsigned int i,unsigned int size)820   static inline unsigned int round_up(unsigned int i, unsigned int size) {
821     return ((i + size - 1)/size)*size;
822   }
823 
advance_file(FStream & out,int pos)824   static void advance_file(FStream & out, int pos) {
825     int diff = pos - out.tell();
826     assert(diff >= 0);
827     for(; diff != 0; --diff)
828       out << '\0';
829   }
830 
create(StringEnumeration * els,const Language & lang,Config & config)831   PosibErr<void> create (StringEnumeration * els,
832 			 const Language & lang,
833                          Config & config)
834   {
835     assert(sizeof(u16int) == 2);
836     assert(sizeof(u32int) == 4);
837 
838     bool full_soundslike = !(strcmp(lang.soundslike_name(), "none") == 0 ||
839                              strcmp(lang.soundslike_name(), "stripped") == 0 ||
840                              strcmp(lang.soundslike_name(), "simple") == 0);
841 
842     bool affix_compress = (lang.affix() &&
843                            config.retrieve_bool("affix-compress"));
844 
845     bool partially_expand = (affix_compress &&
846                              !full_soundslike &&
847                              config.retrieve_bool("partially-expand"));
848 
849     bool invisible_soundslike = false;
850     if (partially_expand)
851       invisible_soundslike = true;
852     else if (config.have("invisible-soundslike"))
853       invisible_soundslike = config.retrieve_bool("invisible-soundslike");
854     else if (!full_soundslike)
855       invisible_soundslike = true;
856 
857     ConvEC iconv;
858     if (!config.have("norm-strict"))
859       config.replace("norm-strict", "true");
860     if (config.have("encoding")) {
861       String enc = config.retrieve("encoding");
862       RET_ON_ERR(iconv.setup(config, enc, lang.charmap(), NormFrom));
863     } else {
864       RET_ON_ERR(iconv.setup(config, lang.data_encoding(), lang.charmap(), NormFrom));
865     }
866 
867     String base = config.retrieve("master-path");
868 
869     DataHead data_head;
870     memset(&data_head, 0, sizeof(data_head));
871     strcpy(data_head.check_word, cur_check_word);
872 
873     data_head.endian_check = 12345678;
874 
875     data_head.dict_name_size = 1;
876     data_head.lang_name_size = strlen(lang.name()) + 1;
877     data_head.soundslike_name_size    = strlen(lang.soundslike_name()) + 1;
878     data_head.soundslike_version_size = strlen(lang.soundslike_version()) + 1;
879     data_head.head_size  = sizeof(DataHead);
880     data_head.head_size += data_head.dict_name_size;
881     data_head.head_size += data_head.lang_name_size;
882     data_head.head_size += data_head.soundslike_name_size;
883     data_head.head_size += data_head.soundslike_version_size;
884     data_head.head_size  = round_up(data_head.head_size, DataHead::align);
885 
886     data_head.affix_info = affix_compress ? partially_expand ? 1 : 2 : 0;
887     data_head.invisible_soundslike = invisible_soundslike;
888     data_head.soundslike_root_only = affix_compress  && !partially_expand ? 1 : 0;
889 
890 #if 0
891     CERR.printl("FLAGS:  ");
892     if (full_soundslike) CERR.printl("  full soundslike");
893     if (invisible_soundslike) CERR.printl("  invisible soundslike");
894     if (data_head.soundslike_root_only) CERR.printl("  soundslike root only");
895     if (affix_compress) CERR.printl("  affix compress");
896     if (partially_expand) CERR.printl("  partially expand");
897     CERR.printl("---");
898 #endif
899 
900     String temp;
901 
902     int num_entries = 0;
903     int uniq_entries = 0;
904 
905     ObjStack buf(16*1024);
906     String sl_buf;
907 
908     WordData * first = 0;
909 
910     //
911     // Read in Wordlist
912     //
913     {
914       WordListIterator wl_itr(els, &lang, config.retrieve_bool("warn") ? &CERR : 0);
915       wl_itr.init(config);
916       ObjStack exp_buf;
917       WordAff * exp_list;
918       WordAff single;
919       single.next = 0;
920       Vector<WordAff> af_list;
921       WordData * * prev = &first;
922 
923       for (;;) {
924 
925         PosibErr<bool> pe = wl_itr.adv();
926         if (pe.has_err()) return pe;
927         if (!pe.data) break;
928 
929         const char * w = wl_itr->word.str;
930         unsigned int s = wl_itr->word.size;
931 
932         const char * affixes = wl_itr->aff.str;
933 
934         if (*affixes && !lang.affix())
935           return make_err(other_error,
936                           _("Affix flags found in word but no affix file given."));
937 
938         if (*affixes && !affix_compress) {
939           exp_buf.reset();
940           exp_list = lang.affix()->expand(w, affixes, exp_buf);
941         } else if (*affixes && partially_expand) {
942           // expand any affixes which will effect the first
943           // 3 letters of a word.  This is needed so that the
944           // jump tables will function correctly
945           exp_buf.reset();
946           exp_list = lang.affix()->expand(w, affixes, exp_buf, 3);
947         } else {
948           single.word.str = w;
949           single.word.size = strlen(w);
950           single.aff = (const byte *)affixes;
951           exp_list = &single;
952         }
953 
954         // iterate through each expanded word
955 
956         for (WordAff * p = exp_list; p; p = p->next)
957         {
958           const char * w = p->word.str;
959           s = p->word.size;
960 
961           unsigned total_size = WordData::struct_size;
962           unsigned data_size = s + 1;
963           unsigned aff_size = strlen((const char *)p->aff);
964           if (aff_size > 0) data_size += aff_size + 1;
965           total_size += data_size;
966           lang.to_soundslike(sl_buf, w);
967           const char * sl = sl_buf.str();
968           unsigned sl_size = sl_buf.size();
969           if (strcmp(sl,w) == 0) sl = w;
970           if (sl != w) total_size += sl_size + 1;
971 
972           if (total_size - WordData::struct_size > 240)
973             return make_err(invalid_word, MsgConv(lang)(w),
974                             _("The total word length, with soundslike data, is larger than 240 characters."));
975 
976           WordData * b = (WordData *)buf.alloc(total_size, sizeof(void *));
977           *prev = b;
978           b->next = 0;
979           prev = &b->next;
980 
981           b->word_size = s;
982           b->sl_size = strlen(sl);
983           b->data_size = data_size;
984           b->flags = lang.get_word_info(w);
985 
986           char * z = b->word;
987 
988           memcpy(z, w, s + 1);
989           z += s + 1;
990 
991           if (aff_size > 0) {
992             b->flags |= HAVE_AFFIX_FLAG;
993             b->aff = z;
994             memcpy(z, p->aff, aff_size + 1);
995             z += aff_size + 1;
996           } else {
997             b->aff = 0;
998           }
999 
1000           if (sl != w) {
1001             memcpy(z, sl, sl_size + 1);
1002             b->sl = z;
1003           } else {
1004             b->sl = b->word;
1005           }
1006 
1007         }
1008       }
1009       delete els;
1010     }
1011 
1012     //
1013     // sort WordData linked list based on (sl, word)
1014     //
1015 
1016     first = sort(first, SoundslikeLess(&lang));
1017 
1018     //
1019     // duplicate check
1020     //
1021     WordData * prev = first;
1022     WordData * cur = first ? first->next : 0;
1023     InsensitiveEqual ieq(&lang);
1024     while (cur) {
1025       if (strcmp(prev->word, cur->word) == 0) {
1026         // merge affix info if necessary
1027         if (!prev->aff && cur->aff) {
1028           prev->flags |= HAVE_AFFIX_FLAG;
1029           prev->aff = cur->aff;
1030           prev->data_size += strlen(prev->aff) + 1;
1031         } else if (prev->aff && cur->aff) {
1032           unsigned l1 = strlen(prev->aff);
1033           unsigned l2 = strlen(cur->aff);
1034           char * aff = (char *)buf.alloc(l1 + l2 + 1);
1035           memcpy(aff, prev->aff, l1);
1036           prev->aff = aff;
1037           aff += l1;
1038           for (const char * p = cur->aff; *p; ++p) {
1039             if (memchr(prev->aff, *p, l1)) continue;
1040             *aff = *p;
1041             ++aff;
1042           }
1043           *aff = '\0';
1044           prev->data_size = prev->word_size + (aff - prev->aff) + 2;
1045         }
1046         prev->next = cur->next;
1047       } else {
1048         if (ieq(prev->word, cur->word)) prev->flags |= DUPLICATE_FLAG;
1049         else ++uniq_entries;
1050         ++num_entries;
1051         prev = cur;
1052       }
1053       cur = cur->next;
1054     }
1055 
1056     //
1057     //
1058     //
1059 
1060     unsigned data_size = 16;
1061     WordData * p = first;
1062     if (invisible_soundslike) {
1063 
1064       for (; p; p = p->next)
1065         data_size += 3 + p->data_size;
1066 
1067     } else {
1068 
1069       while (p)
1070       {
1071         unsigned ds = 2 + p->sl_size + 1;
1072 
1073         char * prev = p->sl;
1074 
1075         do {
1076 
1077           ds += 3 + p->data_size;
1078           p->sl = prev;
1079           p = p->next;
1080 
1081         } while (p && strcmp(prev, p->sl) == 0 && ds + 3 + p->data_size < 255);
1082 
1083         data_size += ds;
1084 
1085       }
1086 
1087     }
1088 
1089     //
1090     // Create the final data structures
1091     //
1092 
1093     CharVector     data;
1094     data.reserve(data_size);
1095     data.write32(0); // to avoid nasty special cases
1096     unsigned int prev_pos = data.size();
1097     data.write32(0);
1098     unsigned prev_w_pos = data.size();
1099 
1100     WordLookup lookup(affix_compress
1101                       ? uniq_entries * 3 / 2
1102                       : uniq_entries * 5 / 4);
1103     lookup.parms().block_begin = data.begin();
1104     lookup.parms().hash .lang     = &lang;
1105     lookup.parms().equal.cmp.lang = &lang;
1106 
1107     Vector<Jump> jump1;
1108     Vector<Jump> jump2;
1109 
1110     const int head_size = invisible_soundslike ? 3 : 2;
1111 
1112     const char * prev_sl = "";
1113     p = first;
1114     while (p)
1115     {
1116       if (invisible_soundslike) {
1117 
1118         data.write(p->flags); // flags
1119         data.write('\0'); // place holder for offset to next item
1120         data.write(p->word_size);
1121 
1122       } else {
1123 
1124         data.write('\0'); // place holder for offset to next item
1125         data.write(p->sl_size);
1126 
1127       }
1128 
1129       if (strncmp(prev_sl, p->sl, 3) != 0) {
1130 
1131         Jump jump;
1132         strncpy(jump.sl, p->sl, 3);
1133         jump.loc = data.size();
1134         jump2.push_back(jump);
1135 
1136         if (strncmp(prev_sl, p->sl, 2) != 0) {
1137           Jump jump;
1138           strncpy(jump.sl, p->sl, 2);
1139           jump.loc = jump2.size() - 1;
1140           jump1.push_back(jump);
1141         }
1142 
1143         data[prev_pos - NEXT_O] = (byte)(data.size() - prev_pos - head_size + 1);
1144         // when advanced to this position the offset byte will
1145         // be null (since it will point to the null terminator
1146         // of the last word) and will thus signal the end of the
1147         // group
1148 
1149       } else {
1150 
1151         data[prev_pos - NEXT_O] = (byte)(data.size() - prev_pos);
1152 
1153       }
1154 
1155       prev_pos = data.size();
1156       prev_sl = p->sl;
1157 
1158       if (invisible_soundslike) {
1159 
1160         unsigned pos = data.size();
1161         prev_w_pos = data.size();
1162         data.write(p->word, p->word_size + 1);
1163         if (p->aff) data.write(p->aff, p->data_size - p->word_size - 1);
1164         lookup.insert(pos);
1165 
1166         p = p->next;
1167 
1168       } else {
1169 
1170         data.write(p->sl, p->sl_size + 1);
1171 
1172         // write all word entries with the same soundslike
1173 
1174         do {
1175           data.write(p->flags);
1176           data.write(p->data_size + 3);
1177           data.write(p->word_size);
1178 
1179           unsigned pos = data.size();
1180           data[prev_w_pos - NEXT_O] = (byte)(pos - prev_w_pos);
1181           data.write(p->word, p->word_size + 1);
1182           if (p->aff) data.write(p->aff, p->data_size - p->word_size - 1);
1183           lookup.insert(pos);
1184 
1185           prev_w_pos = pos;
1186           prev_sl = p->sl;
1187 
1188           p = p->next;
1189 
1190         } while (p && prev_sl == p->sl); // yes I really mean to use pointer compare here
1191       }
1192     }
1193 
1194     // add special end case
1195     if (data.size() % 2 != 0) data.write('\0');
1196     data.write16(0);
1197     data.write16(0);
1198     data[prev_pos - NEXT_O] |= (byte)(data.size() - prev_pos);
1199 
1200     jump2.push_back(Jump());
1201     jump1.push_back(Jump());
1202 
1203     data.write(0);
1204     data.write(0);
1205     data.write(0);
1206 
1207     if (invisible_soundslike)
1208       data_head.first_word_offset = data[4 - NEXT_O] + 4;
1209     else
1210       data_head.first_word_offset = data[8 - NEXT_O] + 8;
1211 
1212     memset(data.data(), 0, 8);
1213 
1214     //CERR.printf("%d == %d\n", lookup.size(), uniq_entries);
1215     //assert(lookup.size() == uniq_entries);
1216 
1217     data_head.word_count   = num_entries;
1218     data_head.word_buckets = lookup.bucket_count();
1219 
1220     FStream out;
1221     out.open(base, "wb");
1222 
1223     advance_file(out, data_head.head_size);
1224 
1225     // Write jump1 table
1226     data_head.jump1_offset = out.tell() - data_head.head_size;
1227     out.write(jump1.data(), jump1.size() * sizeof(Jump));
1228 
1229     // Write jump2 table
1230     advance_file(out, round_up(out.tell(), DataHead::align));
1231     data_head.jump2_offset = out.tell() - data_head.head_size;
1232     out.write(jump2.data(), jump2.size() * sizeof(Jump));
1233 
1234     // Write data block
1235     advance_file(out, round_up(out.tell(), DataHead::align));
1236     data_head.word_offset = out.tell() - data_head.head_size;
1237     out.write(data.data(), data.size());
1238 
1239     // Write hash
1240     advance_file(out, round_up(out.tell(), DataHead::align));
1241     data_head.hash_offset = out.tell() - data_head.head_size;
1242     out.write(&lookup.vector().front(), lookup.vector().size() * 4);
1243 
1244     // calculate block size
1245     advance_file(out, round_up(out.tell(), DataHead::align));
1246     data_head.block_size = out.tell() - data_head.head_size;
1247 
1248     // write data head to file
1249     out.seek(0);
1250     out.write(&data_head, sizeof(DataHead));
1251     out.write(" ", 1);
1252     out.write(lang.name(), data_head.lang_name_size);
1253     out.write(lang.soundslike_name(), data_head.soundslike_name_size);
1254     out.write(lang.soundslike_version(), data_head.soundslike_version_size);
1255 
1256     return no_err;
1257   }
1258 
1259 }
1260 
1261 namespace aspeller {
create_default_readonly_dict(StringEnumeration * els,Config & config)1262   PosibErr<void> create_default_readonly_dict(StringEnumeration * els,
1263                                               Config & config)
1264   {
1265     CachePtr<Language> lang;
1266     PosibErr<Language *> res = new_language(config);
1267     if (res.has_err()) return res;
1268     lang.reset(res.data);
1269     lang->set_lang_defaults(config);
1270     RET_ON_ERR(create(els,*lang,config));
1271     return no_err;
1272   }
1273 }
1274 
1275