1 // This file is part of The New Aspell
2 // Copyright (C) 2000-2001,2011 by Kevin Atkinson under the GNU LGPL
3 // license version 2.0 or 2.1. You should have received a copy of the
4 // LGPL license along with this library if you did not you can find it
5 // at http://www.gnu.org/.
6
7 // Aspell's main word list is laid out as follows:
8 //
9 // * header
10 // * jump table for editdist 1
11 // * jump table for editdist 2
12 // * data block
13 // * hash table
14
15 // data block laid out as follows:
16 //
17 // Words:
18 // (<8 bit frequency><8 bit: flags><8 bit: offset to next word>
19 // <8 bit: word size><word><null>
20 // [<affix info><null>][<category info><null>])+
21 // Words with soundslike:
22 // (<8 bit: offset to next item><8 bit: soundslike size><soundslike>
23 // <words with that soundlike>)+
24 // Flags are mapped as follows:
25 // bits 0-3: word info
26 // bit 4: duplicate flag
27 // bit 5: <unused>
28 // bit 6: have affix info
29 // bit 7: have compound info
30
31 #include <utility>
32 using std::pair;
33
34 #include <string.h>
35 #include <stdio.h>
36 //#include <errno.h>
37
38 #include "settings.h"
39
40 #include "block_vector.hpp"
41 #include "config.hpp"
42 #include "data.hpp"
43 #include "data_util.hpp"
44 #include "errors.hpp"
45 #include "file_util.hpp"
46 #include "fstream.hpp"
47 #include "language.hpp"
48 #include "stack_ptr.hpp"
49 #include "objstack.hpp"
50 #include "vector.hpp"
51 #include "vector_hash-t.hpp"
52 #include "check_list.hpp"
53 #include "lsort.hpp"
54
55 #include "iostream.hpp"
56
57 #include "gettext.h"
58
59 typedef unsigned int u32int;
60 static const u32int u32int_max = (u32int)-1;
61 typedef unsigned short u16int;
62 typedef unsigned char byte;
63
64 #ifdef USE_32_BIT_HASH_FUN
65 typedef u32int hash_int_t;
66 #else
67 typedef size_t hash_int_t;
68 #endif
69
70 #ifdef HAVE_MMAP
71
72 // POSIX headers
73 #include <fcntl.h>
74 #include <unistd.h>
75 #include <sys/mman.h>
76
77 #endif
78
79 #ifndef MAP_FAILED
80 #define MAP_FAILED (-1)
81 #endif
82
83 #ifdef HAVE_MMAP
84
mmap_open(unsigned int block_size,FStream & f,unsigned int offset)85 static inline char * mmap_open(unsigned int block_size,
86 FStream & f,
87 unsigned int offset)
88 {
89 f.flush();
90 int fd = f.file_no();
91 return static_cast<char *>
92 (mmap(NULL, block_size, PROT_READ, MAP_SHARED, fd, offset));
93 }
94
mmap_free(char * block,unsigned int size)95 static inline void mmap_free(char * block, unsigned int size)
96 {
97 munmap(block, size);
98 }
99
100 #else
101
mmap_open(unsigned int,FStream & f,unsigned int)102 static inline char * mmap_open(unsigned int,
103 FStream & f,
104 unsigned int)
105 {
106 return reinterpret_cast<char *>(MAP_FAILED);
107 }
108
mmap_free(char *,unsigned int)109 static inline void mmap_free(char *, unsigned int)
110 {
111 abort();
112 }
113
114 #endif
115
116 static byte HAVE_AFFIX_FLAG = 1 << 7;
117 static byte HAVE_CATEGORY_FLAG = 1 << 6;
118
119 static byte DUPLICATE_FLAG = 1 << 4;
120 // this flag is set when there is is more than one word for a
121 // particulear "clean" word such as "jello" "Jello". It is set on all
122 // but the last word of the group. Ie, if it is set than the next
123 // word when converted to its "clean" form equals the same value.
124
125 static byte WORD_INFO_MASK = 0x0F;
126
127 static const int FREQUENCY_INFO_O = 4;
128 static const int FLAGS_O = 3;
129 static const int NEXT_O = 2;
130 static const int WORD_SIZE_O = 1;
131
get_word_size(const char * d)132 static inline int get_word_size(const char * d) {
133 return *reinterpret_cast<const byte *>(d - WORD_SIZE_O);
134 }
135
get_flags(const char * d)136 static inline byte get_flags(const char * d) {
137 return *reinterpret_cast<const byte *>(d - FLAGS_O);
138 }
139
get_offset(const char * d)140 static inline byte get_offset(const char * d) {
141 return *reinterpret_cast<const byte *>(d - NEXT_O);
142 }
143
get_next(const char * d)144 static inline const char * get_next(const char * d) {
145 return d + *reinterpret_cast<const byte *>(d - NEXT_O);
146 }
147
get_sl_words_begin(const char * d)148 static inline const char * get_sl_words_begin(const char * d) {
149 return d + *reinterpret_cast<const byte *>(d - WORD_SIZE_O) + 4;
150 // FIXME: This isn't right when frequency info is stored in the table
151 }
152
153 // get_next might go past the end so don't JUST compare
154 // for equality. Ie use while (cur < end) not (cur != end)
get_sl_words_end(const char * d)155 static inline const char * get_sl_words_end(const char * d) {
156 return get_next(d) - 3;
157 }
158
get_affix(const char * d)159 static inline const char * get_affix(const char * d) {
160 int word_size = get_word_size(d);
161 if (get_flags(d) & HAVE_AFFIX_FLAG)
162 return d + word_size + 1;
163 else
164 return d + word_size;
165 }
166
get_category(const char * d)167 static inline const char * get_category(const char * d) {
168 int word_size = get_word_size(d);
169 if (get_flags(d) & (HAVE_AFFIX_FLAG | HAVE_CATEGORY_FLAG))
170 return d + strlen(d + word_size + 1) + 1;
171 else if (get_flags(d) & HAVE_CATEGORY_FLAG)
172 return d + word_size + 1;
173 else
174 return d + word_size;
175 }
176
duplicate_flag(const char * d)177 static inline bool duplicate_flag(const char * d) {
178 return get_flags(d) & DUPLICATE_FLAG;
179 }
180
181 namespace {
182
183 using namespace aspeller;
184
185 /////////////////////////////////////////////////////////////////////
186 //
187 // ReadOnlyDict
188 //
189
190 struct Jump
191 {
192 char sl[4];
193 u32int loc;
Jump__anon395da9830111::Jump194 Jump() {memset(this, 0, sizeof(Jump));}
195 };
196
197 class ReadOnlyDict : public Dictionary
198 {
199
200 public: //but don't use
201
202 struct WordLookupParms {
203 const char * block_begin;
WordLookupParms__anon395da9830111::ReadOnlyDict::WordLookupParms204 WordLookupParms() {}
205 typedef BlockVector<const u32int> Vector;
206 typedef u32int Value;
207 typedef const char * Key;
208 static const bool is_multi = false;
key__anon395da9830111::ReadOnlyDict::WordLookupParms209 Key key(Value v) const {return block_begin + v;}
210 InsensitiveHash<hash_int_t> hash;
211 InsensitiveEqual equal;
is_nonexistent__anon395da9830111::ReadOnlyDict::WordLookupParms212 bool is_nonexistent(Value v) const {return v == u32int_max;}
make_nonexistent__anon395da9830111::ReadOnlyDict::WordLookupParms213 void make_nonexistent(const Value & v) const {abort();}
214 };
215 typedef VectorHashTable<WordLookupParms> WordLookup;
216
217 public: // but don't use
218
219 char * block;
220 u32int block_size;
221 char * mmaped_block;
222 u32int mmaped_size;
223 const Jump * jump1;
224 const Jump * jump2;
225 WordLookup word_lookup;
226 const char * word_block;
227 const char * first_word;
228
229 ReadOnlyDict(const ReadOnlyDict&);
230 ReadOnlyDict& operator= (const ReadOnlyDict&);
231
232 struct Elements;
233 struct SoundslikeElements;
234
235 public:
236 WordEntryEnumeration * detailed_elements() const;
237 Size size() const;
238 bool empty() const;
239
ReadOnlyDict()240 ReadOnlyDict()
241 : Dictionary(basic_dict, "ReadOnlyDict")
242 {
243 block = 0;
244 }
245
~ReadOnlyDict()246 ~ReadOnlyDict() {
247 if (block != 0) {
248 if (mmaped_block)
249 mmap_free(mmaped_block, mmaped_size);
250 else
251 free(block);
252 }
253 }
254
255 PosibErr<void> load(ParmString, Config &, DictList *, SpellerImpl *);
256 PosibErr<void> check_hash_fun() const;
257 void low_level_dump() const;
258
259 bool lookup(ParmString word, const SensitiveCompare *, WordEntry &) const;
260
261 bool clean_lookup(ParmString, WordEntry &) const;
262
263 bool soundslike_lookup(const WordEntry &, WordEntry &) const;
264 bool soundslike_lookup(ParmString, WordEntry &) const;
265
266 SoundslikeEnumeration * soundslike_elements() const;
267
268 };
269
convert(const char * w,WordEntry & o)270 static inline void convert(const char * w, WordEntry & o) {
271 o.what = WordEntry::Word;
272 o.word = w;
273 o.aff = get_affix(w);
274 o.word_size = get_word_size(w);
275 o.word_info = get_flags(w) & WORD_INFO_MASK;
276 }
277
278 //
279 //
280 //
281
282 struct ReadOnlyDict::Elements : public WordEntryEnumeration
283 {
284 const char * w;
285 WordEntry wi;
Elements__anon395da9830111::ReadOnlyDict::Elements286 Elements(const char * w0) : w(w0) {wi.what = WordEntry::Word;}
next__anon395da9830111::ReadOnlyDict::Elements287 WordEntry * next() {
288 if (get_offset(w) == 0) w += 2; // FIXME: This needs to be 3
289 // when freq info is used
290 if (get_offset(w) == 0) return 0;
291 convert(w, wi);
292 w = get_next(w);
293 return &wi;
294 }
at_end__anon395da9830111::ReadOnlyDict::Elements295 bool at_end() const {return get_offset(w) == 0;}
clone__anon395da9830111::ReadOnlyDict::Elements296 WordEntryEnumeration * clone() const {return new Elements(*this);}
assign__anon395da9830111::ReadOnlyDict::Elements297 void assign (const WordEntryEnumeration * other) {
298 *this = *static_cast<const Elements *>(other);}
299 };
300
detailed_elements() const301 WordEntryEnumeration * ReadOnlyDict::detailed_elements() const {
302 return new Elements(first_word);
303 }
304
low_level_dump() const305 void ReadOnlyDict::low_level_dump() const {
306 bool next_dup = false;
307 const char * w = first_word;
308 for (;;) {
309 if (get_offset(w) == 0) w += 2; // FIXME: This needs to be 3
310 // when freq info is used
311 if (get_offset(w) == 0) break;
312
313 const char * aff = get_affix(w);
314 byte flags = get_flags(w);
315 byte word_info = flags & WORD_INFO_MASK;
316 byte offset = get_offset(w);
317 int size = get_word_size(w);
318 if (next_dup) printf("\\");
319 printf("%s", w);
320 if (flags & HAVE_AFFIX_FLAG) printf("/%s", aff);
321 if (word_info) printf(" [WI: %d]", word_info);
322 //if (flags & DUPLICATE_FLAG) printf(" [NEXT DUP]");
323 const char * p = w;
324 WordLookup::const_iterator i = word_lookup.find(w);
325 if (!next_dup) {
326 if (i == word_lookup.end())
327 printf(" <BAD HASH>");
328 else if (word_block + *i != w) {
329 printf(" <BAD HASH, got %s>", word_block + *i);
330 }
331 else
332 printf(" <hash ok>");
333 }
334 printf("\n");
335 String buf;
336 if (flags & DUPLICATE_FLAG) next_dup = true;
337 else next_dup = false;
338 w = get_next(w);
339 }
340 }
341
check_hash_fun() const342 PosibErr<void> ReadOnlyDict::check_hash_fun() const {
343 const char * w = first_word;
344 for (;;) {
345 if (get_offset(w) == 0) w += 2; // FIXME: This needs to be 3
346 // when freq info is used
347 if (get_offset(w) == 0) break;
348 if (get_word_size(w) >= 12) {
349 const char * p = w;
350 int clean_size = 0;
351 for (;;) {
352 if (!*p) goto next; // reached end before clean_size was at
353 // least 12, thus skip
354 if (lang()->to_clean(*p)) ++clean_size;
355 if (clean_size >= 12) goto clean_size_ok;
356 ++p;
357 }
358 clean_size_ok:
359 WordLookup::const_iterator i = word_lookup.find(w);
360 if (i == word_lookup.end() || word_block + *i != w)
361 return make_err(bad_file_format, file_name(),
362 _("Incompatible hash function."));
363 else
364 return no_err;
365 }
366 next:
367 while (get_flags(w) & DUPLICATE_FLAG)
368 w = get_next(w);
369 w = get_next(w);
370 }
371 return no_err;
372 }
373
size() const374 ReadOnlyDict::Size ReadOnlyDict::size() const {
375 return word_lookup.size();
376 }
377
empty() const378 bool ReadOnlyDict::empty() const {
379 return word_lookup.empty();
380 }
381
382 static const char * const cur_check_word = "aspell default speller rowl 1.10";
383
384 struct DataHead {
385 // all sizes except the last four must to divisible by:
386 static const unsigned int align = 16;
387 char check_word[64];
388 u32int endian_check; // = 12345678
389 char lang_hash[16];
390
391 u32int head_size;
392 u32int block_size;
393 u32int jump1_offset;
394 u32int jump2_offset;
395 u32int word_offset;
396 u32int hash_offset;
397
398 u32int word_count;
399 u32int word_buckets;
400 u32int soundslike_count;
401
402 u32int dict_name_size;
403 u32int lang_name_size;
404 u32int soundslike_name_size;
405 u32int soundslike_version_size;
406
407 u32int first_word_offset; // from word block
408
409 byte affix_info; // 0 = none, 1 = partially expanded, 2 = full
410 byte invisible_soundslike;
411 byte soundslike_root_only;
412 byte compound_info; //
413 byte freq_info;
414 };
415
load(ParmString f0,Config & config,DictList *,SpellerImpl *)416 PosibErr<void> ReadOnlyDict::load(ParmString f0, Config & config,
417 DictList *, SpellerImpl *)
418 {
419 set_file_name(f0);
420 const char * fn = file_name();
421
422 FStream f;
423 RET_ON_ERR(f.open(fn, "rb"));
424
425 DataHead data_head;
426
427 f.read(&data_head, sizeof(DataHead));
428
429 #if 0
430 COUT << "Head Size: " << data_head.head_size << "\n";
431 COUT << "Data Block Size: " << data_head.data_block_size << "\n";
432 COUT << "Hash Block Size: " << data_head.hash_block_size << "\n";
433 COUT << "Total Block Size: " << data_head.total_block_size << "\n";
434 #endif
435
436 if (strcmp(data_head.check_word, cur_check_word) != 0)
437 return make_err(bad_file_format, fn);
438
439 if (data_head.endian_check != 12345678)
440 return make_err(bad_file_format, fn, _("Wrong endian order."));
441
442 CharVector word;
443
444 word.resize(data_head.dict_name_size);
445 f.read(word.data(), data_head.dict_name_size);
446
447 word.resize(data_head.lang_name_size);
448 f.read(word.data(), data_head.lang_name_size);
449
450 PosibErr<void> pe = set_check_lang(word.data(),config);
451 if (pe.has_err()) {
452 if (pe.prvw_err()->is_a(language_related_error))
453 return pe.with_file(fn);
454 else
455 return pe;
456 }
457
458 if (data_head.soundslike_name_size != 0) {
459 word.resize(data_head.soundslike_name_size);
460 f.read(word.data(), data_head.soundslike_name_size);
461
462 if (strcmp(word.data(), lang()->soundslike_name()) != 0)
463 return make_err(bad_file_format, fn, _("Wrong soundslike."));
464
465 word.resize(data_head.soundslike_version_size);
466 f.read(word.data(), data_head.soundslike_version_size);
467
468 if (strcmp(word.data(), lang()->soundslike_version()) != 0)
469 return make_err(bad_file_format, fn, _("Wrong soundslike version."));
470 }
471
472 invisible_soundslike = data_head.invisible_soundslike;
473 soundslike_root_only = data_head.soundslike_root_only;
474
475 affix_compressed = data_head.affix_info;
476
477 block_size = data_head.block_size;
478 int offset = data_head.head_size;
479 mmaped_block = mmap_open(block_size + offset, f, 0);
480 if( mmaped_block != (char *)MAP_FAILED) {
481 block = mmaped_block + offset;
482 mmaped_size = block_size + offset;
483 } else {
484 mmaped_block = 0;
485 block = (char *)malloc(block_size);
486 f.seek(data_head.head_size);
487 f.read(block, block_size);
488 }
489
490 if (data_head.jump2_offset) {
491 fast_scan = true;
492 jump1 = reinterpret_cast<const Jump *>(block + data_head.jump1_offset);
493 jump2 = reinterpret_cast<const Jump *>(block + data_head.jump2_offset);
494 } else {
495 jump1 = jump2 = 0;
496 }
497
498 word_block = block + data_head.word_offset;
499 first_word = word_block + data_head.first_word_offset;
500
501 word_lookup.parms().block_begin = word_block;
502 word_lookup.parms().hash .lang = lang();
503 word_lookup.parms().equal.cmp.lang = lang();
504 const u32int * begin = reinterpret_cast<const u32int *>
505 (block + data_head.hash_offset);
506 word_lookup.vector().set(begin, begin + data_head.word_buckets);
507 word_lookup.set_size(data_head.word_count);
508
509 //low_level_dump();
510 RET_ON_ERR(check_hash_fun());
511
512 return no_err;
513 }
514
515 void lookup_adv(WordEntry * wi);
516
prep_next(WordEntry * wi,const char * w,const SensitiveCompare * c,const char * orig)517 static inline void prep_next(WordEntry * wi,
518 const char * w,
519 const SensitiveCompare * c,
520 const char * orig)
521 {
522 loop:
523 if (!duplicate_flag(w)) return;
524 w = get_next(w);
525 if (!(*c)(orig, w)) goto loop;
526 wi->intr[0] = (void *)w;
527 wi->intr[1] = (void *)c;
528 wi->intr[2] = (void *)orig;
529 wi->adv_ = lookup_adv;
530 }
531
lookup_adv(WordEntry * wi)532 void lookup_adv(WordEntry * wi)
533 {
534 const char * w = (const char *)wi->intr[0];
535 const SensitiveCompare * c = (const SensitiveCompare *)wi->intr[1];
536 const char * orig = (const char *)wi->intr[2];
537 convert(w,*wi);
538 wi->adv_ = 0;
539 prep_next(wi, w, c, orig);
540 }
541
lookup(ParmString word,const SensitiveCompare * c,WordEntry & o) const542 bool ReadOnlyDict::lookup(ParmString word, const SensitiveCompare * c,
543 WordEntry & o) const
544 {
545 o.clear();
546 WordLookup::const_iterator i = word_lookup.find(word);
547 if (i == word_lookup.end()) return false;
548 const char * w = word_block + *i;
549 for (;;) {
550 if ((*c)(word, w)) {
551 convert(w,o);
552 prep_next(&o, w, c, word);
553 return true;
554 }
555 if (!duplicate_flag(w)) break;
556 w = get_next(w);
557 }
558 return false;
559 }
560
561 struct ReadOnlyDict::SoundslikeElements : public SoundslikeEnumeration
562 {
563 WordEntry data;
564 const ReadOnlyDict * obj;
565 const Jump * jump1;
566 const Jump * jump2;
567 const char * cur;
568 const char * prev;
569 int level;
570 bool invisible_soundslike;
571
572 WordEntry * next(int stopped_at);
573
SoundslikeElements__anon395da9830111::ReadOnlyDict::SoundslikeElements574 SoundslikeElements(const ReadOnlyDict * o)
575 : obj(o), jump1(obj->jump1), jump2(obj->jump2), cur(0),
576 level(1), invisible_soundslike(o->invisible_soundslike) {
577 data.what = o->invisible_soundslike ? WordEntry::Word : WordEntry::Soundslike;}
578 };
579
next(int stopped_at)580 WordEntry * ReadOnlyDict::SoundslikeElements::next(int stopped_at) {
581
582 //CERR << level << ":" << stopped_at << " :";
583 //CERR << jump1->sl << ":" << jump2->sl << "\n";
584
585 loop:
586
587 const char * tmp = cur;
588 const char * p;
589
590 if (level == 1 && stopped_at < 2) {
591
592 ++jump1;
593 tmp = jump1->sl;
594 goto jquit;
595
596 } else if (level == 2 && stopped_at < 3) {
597
598 ++jump2;
599 if (jump2[-1].sl[1] != jump2[0].sl[1]) {
600 ++jump1;
601 level = 1;
602 tmp = jump1->sl;
603 } else {
604 tmp = jump2->sl;
605 }
606 goto jquit;
607
608 } else if (level == 1) {
609
610 level = 2;
611 jump2 = obj->jump2 + jump1->loc;
612 tmp = jump2->sl;
613 goto jquit;
614
615 } else if (level == 2) {
616
617 tmp = cur = obj->word_block + jump2->loc;
618 level = 3;
619
620 } else if (get_offset(cur) == 0) {
621
622 level = 2;
623 ++jump2;
624 if (jump2[-1].sl[1] != jump2[0].sl[1]) {
625 level = 1;
626 ++jump1;
627 tmp = jump1->sl;
628 } else {
629 tmp = jump2->sl;
630 }
631 goto jquit;
632
633 }
634
635 cur = get_next(cur); // this will be the NEXT item looked at
636
637 p = prev;
638 prev = tmp;
639 if (p) {
640 // PRECOND:
641 // unless stopped_at >= LARGE_NUM
642 // strlen(p) >= stopped_at
643 // (stopped_at >= 3) implies
644 // strncmp(p, tmp, 3) == 0 if !invisible_soundslike
645 // strncmp(to_sl(p), to_sl(tmp), 3) == 0 if invisible_soundslike
646 if (stopped_at == 3) {
647 if (p[3] == tmp[3]) goto loop;
648 } else if (stopped_at == 4) {
649 if (p[3] == tmp[3] && tmp[3] &&
650 p[4] == tmp[4]) goto loop;
651 } else if (stopped_at == 5) {
652 if (p[3] == tmp[3] && tmp[3] &&
653 p[4] == tmp[4] && tmp[4] &&
654 p[5] == tmp[5]) goto loop;
655 }
656 }
657
658 data.word = tmp;
659 data.word_size = get_word_size(tmp);
660 if (invisible_soundslike) {
661 convert(tmp, data);
662 }
663 data.intr[0] = (void *)tmp;
664
665 return &data;
666
667 jquit:
668 prev = 0;
669 if (!*tmp) return 0;
670 data.word = tmp;
671 data.word_size = !tmp[1] ? 1 : !tmp[2] ? 2 : 3;
672 data.intr[0] = 0;
673 if (invisible_soundslike) {
674 data.what = WordEntry::Clean;
675 data.aff = 0;
676 }
677 return &data;
678 }
679
soundslike_elements() const680 SoundslikeEnumeration * ReadOnlyDict::soundslike_elements() const {
681
682 return new SoundslikeElements(this);
683
684 }
685
soundslike_next(WordEntry * w)686 static void soundslike_next(WordEntry * w)
687 {
688 const char * cur = (const char *)(w->intr[0]);
689 const char * end = (const char *)(w->intr[1]);
690 convert(cur, *w);
691 cur = get_next(cur);
692 w->intr[0] = (void *)cur;
693 if (cur >= end) w->adv_ = 0;
694 }
695
clean_lookup_adv(WordEntry * wi)696 static void clean_lookup_adv(WordEntry * wi)
697 {
698 const char * w = wi->word;
699 w = get_next(w);
700 convert(w,*wi);
701 if (!duplicate_flag(w)) wi->adv_ = 0;
702 }
703
clean_lookup(ParmString sl,WordEntry & o) const704 bool ReadOnlyDict::clean_lookup(ParmString sl, WordEntry & o) const
705 {
706 o.clear();
707 WordLookup::const_iterator i = word_lookup.find(sl);
708 if (i == word_lookup.end()) return false;
709 const char * w = word_block + *i;
710 convert(w, o);
711 if (duplicate_flag(w)) o.adv_ = clean_lookup_adv;
712 return true;
713 }
714
soundslike_lookup(const WordEntry & s,WordEntry & w) const715 bool ReadOnlyDict::soundslike_lookup(const WordEntry & s, WordEntry & w) const
716 {
717 if (s.intr[0] == 0) {
718
719 return false;
720
721 } else if (!invisible_soundslike) {
722
723 w.clear();
724 w.what = WordEntry::Word;
725 w.intr[0] = (void *)get_sl_words_begin(s.word);
726 w.intr[1] = (void *)get_sl_words_end(s.word);
727 w.adv_ = soundslike_next;
728 soundslike_next(&w);
729 return true;
730
731 } else {
732
733 w.clear();
734 w.what = WordEntry::Word;
735 convert(s.word, w);
736 return true;
737
738 }
739 }
740
soundslike_lookup(ParmString s,WordEntry & w) const741 bool ReadOnlyDict::soundslike_lookup(ParmString s, WordEntry & w) const
742 {
743 if (invisible_soundslike) {
744 return ReadOnlyDict::clean_lookup(s,w);
745 } else {
746 return false;
747 }
748 }
749
750 }
751
752 namespace aspeller {
753
new_default_readonly_dict()754 Dictionary * new_default_readonly_dict() {
755 return new ReadOnlyDict();
756 }
757
758 }
759
760 namespace {
761
762 // Possible:
763 // No Affix Compression:
764 // no soundslike
765 // invisible soundslike
766 // with soundslike
767 // Affix Compression:
768 // group by root:
769 // no soundslike
770 // invisible soundslike
771 // with soundslike
772 // expand prefix:
773 // no soundslike
774 // invisible soundslike
775
776 using namespace aspeller;
777
778 struct WordData {
779 static const unsigned struct_size;
780 WordData * next;
781 char * sl;
782 char * aff;
783 byte word_size;
784 byte sl_size;
785 byte data_size;
786 byte flags;
787 char word[1];
788 };
789
790 const unsigned WordData::struct_size = sizeof(WordData) - 1;
791
792
793 struct SoundslikeLess {
794 InsensitiveCompare icomp;
SoundslikeLess__anon395da9830211::SoundslikeLess795 SoundslikeLess(const Language * l) : icomp(l) {}
operator ()__anon395da9830211::SoundslikeLess796 bool operator() (WordData * x, WordData * y) const {
797 int res = strcmp(x->sl, y->sl);
798 if (res != 0) return res < 0;
799 res = icomp(x->word, y->word);
800 if (res != 0) return res < 0;
801 return strcmp(x->word, y->word) < 0;
802 }
803 };
804
805 struct WordLookupParms {
806 const char * block_begin;
WordLookupParms__anon395da9830211::WordLookupParms807 WordLookupParms() {}
808 typedef acommon::Vector<u32int> Vector;
809 typedef u32int Value;
810 typedef const char * Key;
811 static const bool is_multi = false;
key__anon395da9830211::WordLookupParms812 Key key(Value v) const {return block_begin + v;}
813 InsensitiveHash<hash_int_t> hash;
814 InsensitiveEqual equal;
is_nonexistent__anon395da9830211::WordLookupParms815 bool is_nonexistent(Value v) const {return v == u32int_max;}
make_nonexistent__anon395da9830211::WordLookupParms816 void make_nonexistent(Value & v) const {v = u32int_max;}
817 };
818 typedef VectorHashTable<WordLookupParms> WordLookup;
819
round_up(unsigned int i,unsigned int size)820 static inline unsigned int round_up(unsigned int i, unsigned int size) {
821 return ((i + size - 1)/size)*size;
822 }
823
advance_file(FStream & out,int pos)824 static void advance_file(FStream & out, int pos) {
825 int diff = pos - out.tell();
826 assert(diff >= 0);
827 for(; diff != 0; --diff)
828 out << '\0';
829 }
830
create(StringEnumeration * els,const Language & lang,Config & config)831 PosibErr<void> create (StringEnumeration * els,
832 const Language & lang,
833 Config & config)
834 {
835 assert(sizeof(u16int) == 2);
836 assert(sizeof(u32int) == 4);
837
838 bool full_soundslike = !(strcmp(lang.soundslike_name(), "none") == 0 ||
839 strcmp(lang.soundslike_name(), "stripped") == 0 ||
840 strcmp(lang.soundslike_name(), "simple") == 0);
841
842 bool affix_compress = (lang.affix() &&
843 config.retrieve_bool("affix-compress"));
844
845 bool partially_expand = (affix_compress &&
846 !full_soundslike &&
847 config.retrieve_bool("partially-expand"));
848
849 bool invisible_soundslike = false;
850 if (partially_expand)
851 invisible_soundslike = true;
852 else if (config.have("invisible-soundslike"))
853 invisible_soundslike = config.retrieve_bool("invisible-soundslike");
854 else if (!full_soundslike)
855 invisible_soundslike = true;
856
857 ConvEC iconv;
858 if (!config.have("norm-strict"))
859 config.replace("norm-strict", "true");
860 if (config.have("encoding")) {
861 String enc = config.retrieve("encoding");
862 RET_ON_ERR(iconv.setup(config, enc, lang.charmap(), NormFrom));
863 } else {
864 RET_ON_ERR(iconv.setup(config, lang.data_encoding(), lang.charmap(), NormFrom));
865 }
866
867 String base = config.retrieve("master-path");
868
869 DataHead data_head;
870 memset(&data_head, 0, sizeof(data_head));
871 strcpy(data_head.check_word, cur_check_word);
872
873 data_head.endian_check = 12345678;
874
875 data_head.dict_name_size = 1;
876 data_head.lang_name_size = strlen(lang.name()) + 1;
877 data_head.soundslike_name_size = strlen(lang.soundslike_name()) + 1;
878 data_head.soundslike_version_size = strlen(lang.soundslike_version()) + 1;
879 data_head.head_size = sizeof(DataHead);
880 data_head.head_size += data_head.dict_name_size;
881 data_head.head_size += data_head.lang_name_size;
882 data_head.head_size += data_head.soundslike_name_size;
883 data_head.head_size += data_head.soundslike_version_size;
884 data_head.head_size = round_up(data_head.head_size, DataHead::align);
885
886 data_head.affix_info = affix_compress ? partially_expand ? 1 : 2 : 0;
887 data_head.invisible_soundslike = invisible_soundslike;
888 data_head.soundslike_root_only = affix_compress && !partially_expand ? 1 : 0;
889
890 #if 0
891 CERR.printl("FLAGS: ");
892 if (full_soundslike) CERR.printl(" full soundslike");
893 if (invisible_soundslike) CERR.printl(" invisible soundslike");
894 if (data_head.soundslike_root_only) CERR.printl(" soundslike root only");
895 if (affix_compress) CERR.printl(" affix compress");
896 if (partially_expand) CERR.printl(" partially expand");
897 CERR.printl("---");
898 #endif
899
900 String temp;
901
902 int num_entries = 0;
903 int uniq_entries = 0;
904
905 ObjStack buf(16*1024);
906 String sl_buf;
907
908 WordData * first = 0;
909
910 //
911 // Read in Wordlist
912 //
913 {
914 WordListIterator wl_itr(els, &lang, config.retrieve_bool("warn") ? &CERR : 0);
915 wl_itr.init(config);
916 ObjStack exp_buf;
917 WordAff * exp_list;
918 WordAff single;
919 single.next = 0;
920 Vector<WordAff> af_list;
921 WordData * * prev = &first;
922
923 for (;;) {
924
925 PosibErr<bool> pe = wl_itr.adv();
926 if (pe.has_err()) return pe;
927 if (!pe.data) break;
928
929 const char * w = wl_itr->word.str;
930 unsigned int s = wl_itr->word.size;
931
932 const char * affixes = wl_itr->aff.str;
933
934 if (*affixes && !lang.affix())
935 return make_err(other_error,
936 _("Affix flags found in word but no affix file given."));
937
938 if (*affixes && !affix_compress) {
939 exp_buf.reset();
940 exp_list = lang.affix()->expand(w, affixes, exp_buf);
941 } else if (*affixes && partially_expand) {
942 // expand any affixes which will effect the first
943 // 3 letters of a word. This is needed so that the
944 // jump tables will function correctly
945 exp_buf.reset();
946 exp_list = lang.affix()->expand(w, affixes, exp_buf, 3);
947 } else {
948 single.word.str = w;
949 single.word.size = strlen(w);
950 single.aff = (const byte *)affixes;
951 exp_list = &single;
952 }
953
954 // iterate through each expanded word
955
956 for (WordAff * p = exp_list; p; p = p->next)
957 {
958 const char * w = p->word.str;
959 s = p->word.size;
960
961 unsigned total_size = WordData::struct_size;
962 unsigned data_size = s + 1;
963 unsigned aff_size = strlen((const char *)p->aff);
964 if (aff_size > 0) data_size += aff_size + 1;
965 total_size += data_size;
966 lang.to_soundslike(sl_buf, w);
967 const char * sl = sl_buf.str();
968 unsigned sl_size = sl_buf.size();
969 if (strcmp(sl,w) == 0) sl = w;
970 if (sl != w) total_size += sl_size + 1;
971
972 if (total_size - WordData::struct_size > 240)
973 return make_err(invalid_word, MsgConv(lang)(w),
974 _("The total word length, with soundslike data, is larger than 240 characters."));
975
976 WordData * b = (WordData *)buf.alloc(total_size, sizeof(void *));
977 *prev = b;
978 b->next = 0;
979 prev = &b->next;
980
981 b->word_size = s;
982 b->sl_size = strlen(sl);
983 b->data_size = data_size;
984 b->flags = lang.get_word_info(w);
985
986 char * z = b->word;
987
988 memcpy(z, w, s + 1);
989 z += s + 1;
990
991 if (aff_size > 0) {
992 b->flags |= HAVE_AFFIX_FLAG;
993 b->aff = z;
994 memcpy(z, p->aff, aff_size + 1);
995 z += aff_size + 1;
996 } else {
997 b->aff = 0;
998 }
999
1000 if (sl != w) {
1001 memcpy(z, sl, sl_size + 1);
1002 b->sl = z;
1003 } else {
1004 b->sl = b->word;
1005 }
1006
1007 }
1008 }
1009 delete els;
1010 }
1011
1012 //
1013 // sort WordData linked list based on (sl, word)
1014 //
1015
1016 first = sort(first, SoundslikeLess(&lang));
1017
1018 //
1019 // duplicate check
1020 //
1021 WordData * prev = first;
1022 WordData * cur = first ? first->next : 0;
1023 InsensitiveEqual ieq(&lang);
1024 while (cur) {
1025 if (strcmp(prev->word, cur->word) == 0) {
1026 // merge affix info if necessary
1027 if (!prev->aff && cur->aff) {
1028 prev->flags |= HAVE_AFFIX_FLAG;
1029 prev->aff = cur->aff;
1030 prev->data_size += strlen(prev->aff) + 1;
1031 } else if (prev->aff && cur->aff) {
1032 unsigned l1 = strlen(prev->aff);
1033 unsigned l2 = strlen(cur->aff);
1034 char * aff = (char *)buf.alloc(l1 + l2 + 1);
1035 memcpy(aff, prev->aff, l1);
1036 prev->aff = aff;
1037 aff += l1;
1038 for (const char * p = cur->aff; *p; ++p) {
1039 if (memchr(prev->aff, *p, l1)) continue;
1040 *aff = *p;
1041 ++aff;
1042 }
1043 *aff = '\0';
1044 prev->data_size = prev->word_size + (aff - prev->aff) + 2;
1045 }
1046 prev->next = cur->next;
1047 } else {
1048 if (ieq(prev->word, cur->word)) prev->flags |= DUPLICATE_FLAG;
1049 else ++uniq_entries;
1050 ++num_entries;
1051 prev = cur;
1052 }
1053 cur = cur->next;
1054 }
1055
1056 //
1057 //
1058 //
1059
1060 unsigned data_size = 16;
1061 WordData * p = first;
1062 if (invisible_soundslike) {
1063
1064 for (; p; p = p->next)
1065 data_size += 3 + p->data_size;
1066
1067 } else {
1068
1069 while (p)
1070 {
1071 unsigned ds = 2 + p->sl_size + 1;
1072
1073 char * prev = p->sl;
1074
1075 do {
1076
1077 ds += 3 + p->data_size;
1078 p->sl = prev;
1079 p = p->next;
1080
1081 } while (p && strcmp(prev, p->sl) == 0 && ds + 3 + p->data_size < 255);
1082
1083 data_size += ds;
1084
1085 }
1086
1087 }
1088
1089 //
1090 // Create the final data structures
1091 //
1092
1093 CharVector data;
1094 data.reserve(data_size);
1095 data.write32(0); // to avoid nasty special cases
1096 unsigned int prev_pos = data.size();
1097 data.write32(0);
1098 unsigned prev_w_pos = data.size();
1099
1100 WordLookup lookup(affix_compress
1101 ? uniq_entries * 3 / 2
1102 : uniq_entries * 5 / 4);
1103 lookup.parms().block_begin = data.begin();
1104 lookup.parms().hash .lang = ⟨
1105 lookup.parms().equal.cmp.lang = ⟨
1106
1107 Vector<Jump> jump1;
1108 Vector<Jump> jump2;
1109
1110 const int head_size = invisible_soundslike ? 3 : 2;
1111
1112 const char * prev_sl = "";
1113 p = first;
1114 while (p)
1115 {
1116 if (invisible_soundslike) {
1117
1118 data.write(p->flags); // flags
1119 data.write('\0'); // place holder for offset to next item
1120 data.write(p->word_size);
1121
1122 } else {
1123
1124 data.write('\0'); // place holder for offset to next item
1125 data.write(p->sl_size);
1126
1127 }
1128
1129 if (strncmp(prev_sl, p->sl, 3) != 0) {
1130
1131 Jump jump;
1132 strncpy(jump.sl, p->sl, 3);
1133 jump.loc = data.size();
1134 jump2.push_back(jump);
1135
1136 if (strncmp(prev_sl, p->sl, 2) != 0) {
1137 Jump jump;
1138 strncpy(jump.sl, p->sl, 2);
1139 jump.loc = jump2.size() - 1;
1140 jump1.push_back(jump);
1141 }
1142
1143 data[prev_pos - NEXT_O] = (byte)(data.size() - prev_pos - head_size + 1);
1144 // when advanced to this position the offset byte will
1145 // be null (since it will point to the null terminator
1146 // of the last word) and will thus signal the end of the
1147 // group
1148
1149 } else {
1150
1151 data[prev_pos - NEXT_O] = (byte)(data.size() - prev_pos);
1152
1153 }
1154
1155 prev_pos = data.size();
1156 prev_sl = p->sl;
1157
1158 if (invisible_soundslike) {
1159
1160 unsigned pos = data.size();
1161 prev_w_pos = data.size();
1162 data.write(p->word, p->word_size + 1);
1163 if (p->aff) data.write(p->aff, p->data_size - p->word_size - 1);
1164 lookup.insert(pos);
1165
1166 p = p->next;
1167
1168 } else {
1169
1170 data.write(p->sl, p->sl_size + 1);
1171
1172 // write all word entries with the same soundslike
1173
1174 do {
1175 data.write(p->flags);
1176 data.write(p->data_size + 3);
1177 data.write(p->word_size);
1178
1179 unsigned pos = data.size();
1180 data[prev_w_pos - NEXT_O] = (byte)(pos - prev_w_pos);
1181 data.write(p->word, p->word_size + 1);
1182 if (p->aff) data.write(p->aff, p->data_size - p->word_size - 1);
1183 lookup.insert(pos);
1184
1185 prev_w_pos = pos;
1186 prev_sl = p->sl;
1187
1188 p = p->next;
1189
1190 } while (p && prev_sl == p->sl); // yes I really mean to use pointer compare here
1191 }
1192 }
1193
1194 // add special end case
1195 if (data.size() % 2 != 0) data.write('\0');
1196 data.write16(0);
1197 data.write16(0);
1198 data[prev_pos - NEXT_O] |= (byte)(data.size() - prev_pos);
1199
1200 jump2.push_back(Jump());
1201 jump1.push_back(Jump());
1202
1203 data.write(0);
1204 data.write(0);
1205 data.write(0);
1206
1207 if (invisible_soundslike)
1208 data_head.first_word_offset = data[4 - NEXT_O] + 4;
1209 else
1210 data_head.first_word_offset = data[8 - NEXT_O] + 8;
1211
1212 memset(data.data(), 0, 8);
1213
1214 //CERR.printf("%d == %d\n", lookup.size(), uniq_entries);
1215 //assert(lookup.size() == uniq_entries);
1216
1217 data_head.word_count = num_entries;
1218 data_head.word_buckets = lookup.bucket_count();
1219
1220 FStream out;
1221 out.open(base, "wb");
1222
1223 advance_file(out, data_head.head_size);
1224
1225 // Write jump1 table
1226 data_head.jump1_offset = out.tell() - data_head.head_size;
1227 out.write(jump1.data(), jump1.size() * sizeof(Jump));
1228
1229 // Write jump2 table
1230 advance_file(out, round_up(out.tell(), DataHead::align));
1231 data_head.jump2_offset = out.tell() - data_head.head_size;
1232 out.write(jump2.data(), jump2.size() * sizeof(Jump));
1233
1234 // Write data block
1235 advance_file(out, round_up(out.tell(), DataHead::align));
1236 data_head.word_offset = out.tell() - data_head.head_size;
1237 out.write(data.data(), data.size());
1238
1239 // Write hash
1240 advance_file(out, round_up(out.tell(), DataHead::align));
1241 data_head.hash_offset = out.tell() - data_head.head_size;
1242 out.write(&lookup.vector().front(), lookup.vector().size() * 4);
1243
1244 // calculate block size
1245 advance_file(out, round_up(out.tell(), DataHead::align));
1246 data_head.block_size = out.tell() - data_head.head_size;
1247
1248 // write data head to file
1249 out.seek(0);
1250 out.write(&data_head, sizeof(DataHead));
1251 out.write(" ", 1);
1252 out.write(lang.name(), data_head.lang_name_size);
1253 out.write(lang.soundslike_name(), data_head.soundslike_name_size);
1254 out.write(lang.soundslike_version(), data_head.soundslike_version_size);
1255
1256 return no_err;
1257 }
1258
1259 }
1260
1261 namespace aspeller {
create_default_readonly_dict(StringEnumeration * els,Config & config)1262 PosibErr<void> create_default_readonly_dict(StringEnumeration * els,
1263 Config & config)
1264 {
1265 CachePtr<Language> lang;
1266 PosibErr<Language *> res = new_language(config);
1267 if (res.has_err()) return res;
1268 lang.reset(res.data);
1269 lang->set_lang_defaults(config);
1270 RET_ON_ERR(create(els,*lang,config));
1271 return no_err;
1272 }
1273 }
1274
1275