1 // Copyright 2004 by Kevin Atkinson under the terms of the LGPL
2 
3 #ifndef ASPELLER_LANGUAGE__HPP
4 #define ASPELLER_LANGUAGE__HPP
5 
6 #include "affix.hpp"
7 #include "cache.hpp"
8 #include "config.hpp"
9 #include "convert.hpp"
10 #include "phonetic.hpp"
11 #include "posib_err.hpp"
12 #include "stack_ptr.hpp"
13 #include "string.hpp"
14 #include "objstack.hpp"
15 #include "string_enumeration.hpp"
16 
17 #include "iostream.hpp"
18 
19 using namespace acommon;
20 
21 namespace acommon {
22   struct CheckInfo;
23 
24   struct ConfigConvKey : public ConvKey {
25     Config::Value config_val;
26     template <typename T>
ConfigConvKeyacommon::ConfigConvKey27     ConfigConvKey(const T & v) : config_val(v) {
28       val = config_val.val;
29       allow_ucs = config_val.secure;
30     }
operator =acommon::ConfigConvKey31     ConfigConvKey & operator=(const ConfigConvKey & other) {
32       config_val = other.config_val;
33       val = config_val.val;
34       allow_ucs = config_val.secure;
35       return *this;
36     }
fix_encoding_stracommon::ConfigConvKey37     void fix_encoding_str() {
38       String buf;
39       ::fix_encoding_str(val, buf);
40       config_val.val.swap(buf);
41       val = config_val.val;
42     }
43   private:
44   };
45 }
46 
47 namespace aspeller {
48 
  // One entry of a replacement table: a pair of C-string pointers.  The
  // struct itself does not manage the memory they point to.
  // NOTE(review): presumably `substr` is the text to look for and `repl`
  // the text suggested in its place -- confirm against the code that fills
  // the table.
  struct SuggestRepl {
    const char * substr;
    const char * repl;
  };
53 
54   class SuggestReplEnumeration
55   {
56     const SuggestRepl * i_;
57     const SuggestRepl * end_;
58   public:
SuggestReplEnumeration(const SuggestRepl * b,const SuggestRepl * e)59     SuggestReplEnumeration(const SuggestRepl * b, const SuggestRepl * e)
60       : i_(b), end_(e) {}
at_end() const61     bool at_end() const {return i_ == end_;}
next()62     const SuggestRepl * next() {
63       if (i_ == end_) return 0;
64       return i_++;
65     }
66   };
67 
68   // CharInfo
69 
70   typedef unsigned int CharInfo; // 6 bits
71 
72   static const CharInfo LOWER  = (1 << 0);
73   static const CharInfo UPPER  = (1 << 1);
74   static const CharInfo TITLE  = (1 << 2);
75   static const CharInfo PLAIN  = (1 << 3);
76   static const CharInfo LETTER = (1 << 4);
77   static const CharInfo CLEAN  = (1 << 5);
78 
79   static const CharInfo CHAR_INFO_ALL = 0x3F;
80 
81   //
82 
83   //
84 
85   struct CompoundWord {
86     const char * word;
87     const char * sep;
88     const char * rest;
89     const char * end;
emptyaspeller::CompoundWord90     bool empty() const {return word == end;}
singleaspeller::CompoundWord91     bool single() const {return rest == end;}
word_lenaspeller::CompoundWord92     unsigned word_len() const {return sep - word;}
rest_offsetaspeller::CompoundWord93     unsigned rest_offset() const {return rest - word;}
rest_lenaspeller::CompoundWord94     unsigned rest_len() const {return end - rest;}
CompoundWordaspeller::CompoundWord95     CompoundWord()
96       : word(), sep(), rest(), end() {}
CompoundWordaspeller::CompoundWord97     CompoundWord(const char * a, const char * b)
98       : word(a), sep(b), rest(b), end(b) {}
CompoundWordaspeller::CompoundWord99     CompoundWord(const char * a, const char * b, const char * c)
100       : word(a), sep(b), rest(b), end(c) {}
CompoundWordaspeller::CompoundWord101     CompoundWord(const char * a, const char * b, const char * c, const char * d)
102       : word(a), sep(b), rest(c), end(d) {}
103   };
104 
  // Type of Language::store_as_.
  // NOTE(review): the enumerator names suggest a choice between the
  // "stripped" and the lower-case form of a word -- confirm where
  // store_as_ is assigned.
  enum StoreAs {Stripped, Lower};
106 
107   class Language : public Cacheable {
108   public:
109     typedef const Config CacheConfig;
110     typedef String       CacheKey;
111 
112     enum CharType {Unknown, WhiteSpace, Hyphen, Digit,
113                    NonLetter, Modifier, Letter};
114 
115     struct SpecialChar {
116       bool begin;
117       bool middle;
118       bool end;
119       bool any;
SpecialCharaspeller::Language::SpecialChar120       SpecialChar() : begin(false), middle(false), end(false), any(false) {}
SpecialCharaspeller::Language::SpecialChar121       SpecialChar(bool b, bool m, bool e) : begin(b), middle(m), end(e),
122                                             any(b || m || e) {}
123     };
124 
125   private:
126     String   dir_;
127     String   name_;
128     String   charset_;
129     String   charmap_;
130     String   data_encoding_;
131 
132     ConvObj  mesg_conv_;
133     ConvObj  to_utf8_;
134     ConvObj  from_utf8_;
135 
to_uchar(char c) const136     unsigned char to_uchar(char c) const {return static_cast<unsigned char>(c);}
137 
138     SpecialChar special_[256];
139     CharInfo      char_info_[256];
140     char          to_lower_[256];
141     char          to_upper_[256];
142     char          to_title_[256];
143     char          to_stripped_[256];
144     char          to_plain_[256];
145     int           to_uni_[256];
146     CharType      char_type_[256];
147     char          to_clean_[256];
148     char          de_accent_[256];
149 
150     StoreAs       store_as_;
151 
152     String      soundslike_chars_;
153     String      clean_chars_;
154 
155     bool have_soundslike_;
156     bool have_repl_;
157 
158     StackPtr<Soundslike> soundslike_;
159     StackPtr<AffixMgr>   affix_;
160     StackPtr<Config>     lang_config_;
161 
162     StringBuffer buf_;
163     Vector<SuggestRepl> repls_;
164 
165     Language(const Language &);
166     void operator=(const Language &);
167 
168   public: // but don't use
169 
170     char          sl_first_[256];
171     char          sl_rest_[256];
172 
173   public:
174 
Language()175     Language() {}
176     PosibErr<void> setup(const String & lang, const Config * config);
177     PosibErr<void> set_lang_defaults(Config & config) const;
178 
data_dir() const179     const char * data_dir() const {return dir_.c_str();}
name() const180     const char * name() const {return name_.c_str();}
charmap() const181     const char * charmap() const {return charmap_.c_str();}
data_encoding() const182     const char * data_encoding() const {return data_encoding_.c_str();}
183 
mesg_conv() const184     const Convert * mesg_conv() const {return mesg_conv_.ptr;}
to_utf8() const185     const Convert * to_utf8() const {return to_utf8_.ptr;}
from_utf8() const186     const Convert * from_utf8() const {return from_utf8_.ptr;}
187 
to_uni(char c) const188     int to_uni(char c) const {return to_uni_[to_uchar(c)];}
189 
190     //
191     // case conversion
192     //
193 
to_upper(char c) const194     char to_upper(char c) const {return to_upper_[to_uchar(c)];}
is_upper(char c) const195     bool is_upper(char c) const {return to_upper(c) == c;}
196 
to_lower(char c) const197     char to_lower(char c) const {return to_lower_[to_uchar(c)];}
is_lower(char c) const198     bool is_lower(char c) const {return to_lower(c) == c;}
199 
to_title(char c) const200     char to_title(char c) const {return to_title_[to_uchar(c)];}
is_title(char c) const201     bool is_title(char c) const {return to_title(c) == c;}
202 
to_lower(char * res,const char * str) const203     char * to_lower(char * res, const char * str) const {
204       while (*str) *res++ = to_lower(*str++); *res = '\0'; return res;}
to_upper(char * res,const char * str) const205     char * to_upper(char * res, const char * str) const {
206       while (*str) *res++ = to_upper(*str++); *res = '\0'; return res;}
207 
to_lower(String & res,const char * str) const208     void to_lower(String & res, const char * str) const {
209       res.clear(); while (*str) res += to_lower(*str++);}
to_upper(String & res,const char * str) const210     void to_upper(String & res, const char * str) const {
211       res.clear(); while (*str) res += to_upper(*str++);}
212 
is_lower(const char * str) const213     bool is_lower(const char * str) const {
214       while (*str) {if (!is_lower(*str++)) return false;} return true;}
is_upper(const char * str) const215     bool is_upper(const char * str) const {
216       while (*str) {if (!is_upper(*str++)) return false;} return true;}
217 
218     //
219     //
220     //
221 
to_plain(char c) const222     char to_plain(char c) const {return to_plain_[to_uchar(c)];}
223 
de_accent(char c) const224     char de_accent(char c) const {return de_accent_[to_uchar(c)];}
225 
special(char c) const226     SpecialChar special(char c) const {return special_[to_uchar(c)];}
227 
char_type(char c) const228     CharType char_type(char c) const {return char_type_[to_uchar(c)];}
is_alpha(char c) const229     bool is_alpha(char c) const {return char_type(c) >  NonLetter;}
230 
char_info(char c) const231     CharInfo char_info(char c) const {return char_info_[to_uchar(c)];}
232 
233     //
234     // stripped
235     //
236 
to_stripped(char c) const237     char to_stripped(char c) const {return to_stripped_[to_uchar(c)];}
238 
239     // return a pointer to the END of the string
to_stripped(char * res,const char * str) const240     char * to_stripped(char * res, const char * str) const {
241       for (; *str; ++str) {
242         char c = to_stripped(*str);
243         if (c) *res++ = c;
244       }
245       *res = '\0';
246       return res;
247     }
to_stripped(String & res,const char * str) const248     void to_stripped(String & res, const char * str) const {
249       res.clear();
250       for (; *str; ++str) {
251         char c = to_stripped(*str);
252         if (c) res += c;
253       }
254     }
255 
is_stripped(char c) const256     bool is_stripped(char c) const {return to_stripped(c) == c;}
257 
is_stripped(const char * str) const258     bool is_stripped(const char * str) const {
259       while (*str) {if (!is_stripped(*str++)) return false;} return true;}
260 
261     //
262     // Clean
263     //
264     // The "clean" form is how words are indixed in the dictionary.
265     // It will at very least convert the word to lower case.  It may
266     // also strip accents and non-letters.
267     //
268 
to_clean(char c) const269     char to_clean(char c) const {return to_clean_[to_uchar(c)];}
270 
to_clean(char * res,const char * str) const271     char * to_clean(char * res, const char * str) const {
272       for (; *str; ++str) {
273         char c = to_clean(*str);
274         if (c) *res++ = c;
275       }
276       *res = '\0';
277       return res;
278     }
to_clean(String & res,const char * str) const279     void to_clean(String & res, const char * str) const {
280       res.clear();
281       for (; *str; ++str) {
282         char c = to_clean(*str);
283         if (c) res += c;
284       }
285     }
286 
is_clean(char c) const287     bool is_clean(char c) const {return to_clean(c) == c;}
288 
is_clean(const char * str) const289     bool is_clean(const char * str) const {
290       while (*str) {if (!is_clean(*str++)) return false;} return true;}
291 
is_clean_wi(WordInfo wi) const292     bool is_clean_wi(WordInfo wi) const {
293       return false;
294       //return wi & CASE_PATTEN == AllLower &&
295     }
296 
297 
clean_chars() const298     const char * clean_chars() const {return clean_chars_.c_str();}
299 
300     //
301     // Soundslike
302     //
303 
have_soundslike() const304     bool have_soundslike() const {return have_soundslike_;}
305 
soundslike_name() const306     const char * soundslike_name() const {return soundslike_->name();}
soundslike_version() const307     const char * soundslike_version() const {return soundslike_->version();}
308 
to_soundslike(String & res,ParmStr word) const309     void to_soundslike(String & res, ParmStr word) const {
310       res.resize(word.size());
311       char * e = soundslike_->to_soundslike(res.data(), word.str(), word.size());
312       res.resize(e - res.data());
313     }
314 
315     // returns a pointer to the END of the string
to_soundslike(char * res,const char * str,int len=-1) const316     char * to_soundslike(char * res, const char * str, int len = -1) const {
317       return soundslike_->to_soundslike(res,str,len);
318     }
319 
to_soundslike(char * res,const char * str,int len,WordInfo wi) const320     char * to_soundslike(char * res, const char * str, int len, WordInfo wi) const {
321       if (!have_soundslike_ && (wi & ALL_CLEAN)) return 0;
322       else return soundslike_->to_soundslike(res,str,len);
323     }
324 
soundslike_chars() const325     const char * soundslike_chars() const {return soundslike_chars_.c_str();}
326 
327     //
328     // Affix compression methods
329     //
330 
affix() const331     const AffixMgr * affix() const {return affix_;}
332 
have_affix() const333     bool have_affix() const {return affix_;}
334 
munch(ParmStr word,GuessInfo * cl,bool cross=true) const335     void munch(ParmStr word, GuessInfo * cl, bool cross = true) const {
336       if (affix_)
337         affix_->munch(word, cl, cross);
338     }
339 
expand(ParmStr word,ParmStr aff,ObjStack & buf,int limit=INT_MAX) const340     WordAff * expand(ParmStr word, ParmStr aff,
341                      ObjStack & buf, int limit = INT_MAX) const {
342       if (affix_)
343         return affix_->expand(word, aff, buf, limit);
344       else
345         return fake_expand(word, aff, buf);
346     }
347     WordAff * fake_expand(ParmStr word, ParmStr aff, ObjStack & buf) const;
348 
349     //
350     // Repl
351     //
352 
have_repl() const353     bool have_repl() const {return have_repl_;}
354 
repl() const355     SuggestReplEnumeration * repl() const {
356       return new SuggestReplEnumeration(repls_.pbegin(), repls_.pend());}
357 
358     //
359     //
360     //
361 
362     WordInfo get_word_info(ParmStr str) const;
363 
364     //
365     // fix_case
366     //
367 
368     CasePattern case_pattern(ParmStr str) const;
369 
370     CasePattern case_pattern(const char * str, unsigned size) const;
371 
fix_case(CasePattern case_pattern,char * str)372     void fix_case(CasePattern case_pattern, char * str)
373     {
374       if (!str[0]) return;
375       if (case_pattern == AllUpper) to_upper(str,str);
376       else if (case_pattern == FirstUpper) *str = to_title(*str);
377     }
378     void fix_case(CasePattern case_pattern,
379                   char * res, const char * str) const;
380     const char * fix_case(CasePattern case_pattern,
381                           const char * str, String & buf) const;
382 
383     //
384     //
385     //
386 
387     CompoundWord split_word(const char * str, unsigned size, bool camel_case) const;
388 
389     //
390     // for cache
391     //
392 
get_new(const String & lang,const Config * config)393     static inline PosibErr<Language *> get_new(const String & lang, const Config * config) {
394       StackPtr<Language> l(new Language());
395       RET_ON_ERR(l->setup(lang, config));
396       return l.release();
397     }
398 
cache_key_eq(const String & l) const399     bool cache_key_eq(const String & l) const  {return name_ == l;}
400   };
401 
  // Alternative name for Language.
  typedef Language LangImpl;
403 
404   struct MsgConv : public ConvP
405   {
MsgConvaspeller::MsgConv406     MsgConv(const Language * l) : ConvP(l->mesg_conv()) {}
MsgConvaspeller::MsgConv407     MsgConv(const Language & l) : ConvP(l.mesg_conv()) {}
408   };
409 
410   struct InsensitiveCompare {
411     // compares to strings without regards to casing or special characters
412     const Language * lang;
InsensitiveCompareaspeller::InsensitiveCompare413     InsensitiveCompare(const Language * l = 0) : lang(l) {}
operator boolaspeller::InsensitiveCompare414     operator bool () const {return lang;}
operator ()aspeller::InsensitiveCompare415     int operator() (const char * a, const char * b) const
416     {
417       char x, y;
418       for (;;)
419       {
420         while (x = lang->to_clean(*a++), !x);
421         while (y = lang->to_clean(*b++), !y);
422         if (x == 0x10 || y == 0x10 || x != y) break;
423       }
424       return static_cast<unsigned char>(x) - static_cast<unsigned char>(y);
425     }
426   };
427 
428   struct InsensitiveEqual {
429     InsensitiveCompare cmp;
InsensitiveEqualaspeller::InsensitiveEqual430     InsensitiveEqual(const Language * l = 0) : cmp(l) {}
operator ()aspeller::InsensitiveEqual431     bool operator() (const char * a, const char * b) const
432     {
433       return cmp(a,b) == 0;
434     }
435   };
436 
437   template <typename HASH_INT = size_t>
438   struct InsensitiveHash {
439     // hashes a string without regards to casing or special begin
440     // or end characters
441     const Language * lang;
InsensitiveHashaspeller::InsensitiveHash442     InsensitiveHash() {}
InsensitiveHashaspeller::InsensitiveHash443     InsensitiveHash(const Language * l)
444 	: lang(l) {}
operator ()aspeller::InsensitiveHash445     HASH_INT operator() (const char * s) const
446     {
447       HASH_INT h = 0;
448       for (;;) {
449 	if (*s == 0) break;
450         unsigned char c = lang->to_clean(*s++);
451 	if (c) h=5*h + c;
452       }
453       return h;
454     }
455   };
456 
457   struct SensitiveCompare {
458     const Language * lang;
459     bool case_insensitive;
460     bool ignore_accents; // unused
461     bool begin; // if not begin we are checking the end of the word
462     bool end;   // if not end we are checking the beginning of the word
463                 // if both false we are checking the middle of a word
SensitiveCompareaspeller::SensitiveCompare464     SensitiveCompare(const Language * l = 0)
465       : lang(l), case_insensitive(false), ignore_accents(false),
466         begin(true), end(true) {}
467     bool operator() (const char * word, const char * inlist) const;
468   };
469 
  // Function object taking a word and a writable affix string; both the
  // constructor and operator() are defined out of line.
  // NOTE(review): presumably it removes affix flags that are not valid for
  // the word, reports them to `log`, and returns the end of the cleaned
  // affix string -- confirm against the definition.
  struct CleanAffix {
    const Language * lang;
    OStream * log;
    MsgConv msgconv1;
    MsgConv msgconv2;
    CleanAffix(const Language * lang0, OStream * log0);
    char * operator() (ParmStr word, char * aff);
  };
478 
  // Iterator whose current element is a (word, affix) pair, read through
  // operator* / operator->.  It is constructed from a StringEnumeration, a
  // Language and a log stream, configured with init() or init_plain(), and
  // stepped with adv().
  // NOTE(review): what exactly is validated or cleaned per entry is decided
  // in the out-of-line definitions; this header only shows the flags.
  class WordListIterator
  {
  public:
    // Current element.
    struct Value {
      SimpleString word;
      SimpleString aff;
    };
    WordListIterator(StringEnumeration * in,
                     const Language * lang,
                     OStream * log);
    // init may set "norm-strict" to true which is why it is not const
    PosibErr<void> init (Config & config);
    // init_plain initializes the iterator to read in a plain word
    // list without any affix flags; for simplicity it will expect the
    // input to be utf-8.  It will also clean the words by default, unless
    // the `clean-words` option is explicitly specified.  Like init, it
    // may set "norm-strict" to true, which is why it is not const.
    PosibErr<void> init_plain (Config & config);
    const Value & operator*() const {return val;}
    const Value * operator-> () const {return &val;}
    // NOTE(review): presumably moves to the next entry and reports whether
    // there was one -- confirm against the definition.
    PosibErr<bool> adv();
  private:
    // Option flags.
    bool have_affix;
    bool validate_words;
    bool validate_affixes;
    bool clean_words;
    bool skip_invalid_words;
    bool clean_affixes;
    StringEnumeration * in;
    const Language * lang;
    ConvEC iconv;
    OStream * log;
    Value val;
    String data;
    const char * orig;
    char * str;
    char * str_end;
    CleanAffix clean_affix;
  };
518 
  // Free functions defined out of line; the notes below are inferred from
  // the names and signatures only.

  // NOTE(review): presumably the set of characters that are their own
  // stripped form -- confirm against the definition.
  String get_stripped_chars(const Language & l);

  // NOTE(review): presumably the set of characters that are their own
  // clean form -- confirm against the definition.
  String get_clean_chars(const Language & l);

  // Checks on a word (and, for validate_affix, its affix flags) against a
  // language; failure is reported through the returned PosibErr.
  PosibErr<void> check_if_sane(const Language & l, ParmStr word);
  PosibErr<void> check_if_valid(const Language & l, ParmStr word);
  PosibErr<void> validate_affix(const Language & l, ParmStr word, ParmStr aff);

  // Takes the Config by non-const reference, so it may modify it.
  bool find_language(Config & c);

  // `lang` defaults to 0.
  PosibErr<Language *> new_language(const Config &, ParmStr lang = 0);

  PosibErr<void> open_affix_file(const Config &, FStream & o);
532 }
533 
534 
535 #endif
536