1 // Copyright 2004 by Kevin Atkinson under the terms of the LGPL 2 3 #ifndef ASPELLER_LANGUAGE__HPP 4 #define ASPELLER_LANGUAGE__HPP 5 6 #include "affix.hpp" 7 #include "cache.hpp" 8 #include "config.hpp" 9 #include "convert.hpp" 10 #include "phonetic.hpp" 11 #include "posib_err.hpp" 12 #include "stack_ptr.hpp" 13 #include "string.hpp" 14 #include "objstack.hpp" 15 #include "string_enumeration.hpp" 16 17 #include "iostream.hpp" 18 19 using namespace acommon; 20 21 namespace acommon { 22 struct CheckInfo; 23 24 struct ConfigConvKey : public ConvKey { 25 Config::Value config_val; 26 template <typename T> ConfigConvKeyacommon::ConfigConvKey27 ConfigConvKey(const T & v) : config_val(v) { 28 val = config_val.val; 29 allow_ucs = config_val.secure; 30 } operator =acommon::ConfigConvKey31 ConfigConvKey & operator=(const ConfigConvKey & other) { 32 config_val = other.config_val; 33 val = config_val.val; 34 allow_ucs = config_val.secure; 35 return *this; 36 } fix_encoding_stracommon::ConfigConvKey37 void fix_encoding_str() { 38 String buf; 39 ::fix_encoding_str(val, buf); 40 config_val.val.swap(buf); 41 val = config_val.val; 42 } 43 private: 44 }; 45 } 46 47 namespace aspeller { 48 49 struct SuggestRepl { 50 const char * substr; 51 const char * repl; 52 }; 53 54 class SuggestReplEnumeration 55 { 56 const SuggestRepl * i_; 57 const SuggestRepl * end_; 58 public: SuggestReplEnumeration(const SuggestRepl * b,const SuggestRepl * e)59 SuggestReplEnumeration(const SuggestRepl * b, const SuggestRepl * e) 60 : i_(b), end_(e) {} at_end() const61 bool at_end() const {return i_ == end_;} next()62 const SuggestRepl * next() { 63 if (i_ == end_) return 0; 64 return i_++; 65 } 66 }; 67 68 // CharInfo 69 70 typedef unsigned int CharInfo; // 6 bits 71 72 static const CharInfo LOWER = (1 << 0); 73 static const CharInfo UPPER = (1 << 1); 74 static const CharInfo TITLE = (1 << 2); 75 static const CharInfo PLAIN = (1 << 3); 76 static const CharInfo LETTER = (1 << 4); 77 static const CharInfo CLEAN = (1 << 5); 78 79 static const CharInfo CHAR_INFO_ALL = 0x3F; 80 81 // 82 83 // 84 85 struct CompoundWord { 86 const char * word; 87 const char * sep; 88 const char * rest; 89 const char * end; emptyaspeller::CompoundWord90 bool empty() const {return word == end;} singleaspeller::CompoundWord91 bool single() const {return rest == end;} word_lenaspeller::CompoundWord92 unsigned word_len() const {return sep - word;} rest_offsetaspeller::CompoundWord93 unsigned rest_offset() const {return rest - word;} rest_lenaspeller::CompoundWord94 unsigned rest_len() const {return end - rest;} CompoundWordaspeller::CompoundWord95 CompoundWord() 96 : word(), sep(), rest(), end() {} CompoundWordaspeller::CompoundWord97 CompoundWord(const char * a, const char * b) 98 : word(a), sep(b), rest(b), end(b) {} CompoundWordaspeller::CompoundWord99 CompoundWord(const char * a, const char * b, const char * c) 100 : word(a), sep(b), rest(b), end(c) {} CompoundWordaspeller::CompoundWord101 CompoundWord(const char * a, const char * b, const char * c, const char * d) 102 : word(a), sep(b), rest(c), end(d) {} 103 }; 104 105 enum StoreAs {Stripped, Lower}; 106 107 class Language : public Cacheable { 108 public: 109 typedef const Config CacheConfig; 110 typedef String CacheKey; 111 112 enum CharType {Unknown, WhiteSpace, Hyphen, Digit, 113 NonLetter, Modifier, Letter}; 114 115 struct SpecialChar { 116 bool begin; 117 bool middle; 118 bool end; 119 bool any; SpecialCharaspeller::Language::SpecialChar120 SpecialChar() : begin(false), middle(false), end(false), any(false) {} SpecialCharaspeller::Language::SpecialChar121 SpecialChar(bool b, bool m, bool e) : begin(b), middle(m), end(e), 122 any(b || m || e) {} 123 }; 124 125 private: 126 String dir_; 127 String name_; 128 String charset_; 129 String charmap_; 130 String data_encoding_; 131 132 ConvObj mesg_conv_; 133 ConvObj to_utf8_; 134 ConvObj from_utf8_; 135 to_uchar(char c) const136 unsigned char to_uchar(char c) const {return static_cast<unsigned char>(c);} 137 138 SpecialChar special_[256]; 139 CharInfo char_info_[256]; 140 char to_lower_[256]; 141 char to_upper_[256]; 142 char to_title_[256]; 143 char to_stripped_[256]; 144 char to_plain_[256]; 145 int to_uni_[256]; 146 CharType char_type_[256]; 147 char to_clean_[256]; 148 char de_accent_[256]; 149 150 StoreAs store_as_; 151 152 String soundslike_chars_; 153 String clean_chars_; 154 155 bool have_soundslike_; 156 bool have_repl_; 157 158 StackPtr<Soundslike> soundslike_; 159 StackPtr<AffixMgr> affix_; 160 StackPtr<Config> lang_config_; 161 162 StringBuffer buf_; 163 Vector<SuggestRepl> repls_; 164 165 Language(const Language &); 166 void operator=(const Language &); 167 168 public: // but don't use 169 170 char sl_first_[256]; 171 char sl_rest_[256]; 172 173 public: 174 Language()175 Language() {} 176 PosibErr<void> setup(const String & lang, const Config * config); 177 PosibErr<void> set_lang_defaults(Config & config) const; 178 data_dir() const179 const char * data_dir() const {return dir_.c_str();} name() const180 const char * name() const {return name_.c_str();} charmap() const181 const char * charmap() const {return charmap_.c_str();} data_encoding() const182 const char * data_encoding() const {return data_encoding_.c_str();} 183 mesg_conv() const184 const Convert * mesg_conv() const {return mesg_conv_.ptr;} to_utf8() const185 const Convert * to_utf8() const {return to_utf8_.ptr;} from_utf8() const186 const Convert * from_utf8() const {return from_utf8_.ptr;} 187 to_uni(char c) const188 int to_uni(char c) const {return to_uni_[to_uchar(c)];} 189 190 // 191 // case conversion 192 // 193 to_upper(char c) const194 char to_upper(char c) const {return to_upper_[to_uchar(c)];} is_upper(char c) const195 bool is_upper(char c) const {return to_upper(c) == c;} 196 to_lower(char c) const197 char to_lower(char c) const {return to_lower_[to_uchar(c)];} is_lower(char c) const198 bool is_lower(char c) const {return to_lower(c) == c;} 199 to_title(char c) const200 char to_title(char c) const {return to_title_[to_uchar(c)];} is_title(char c) const201 bool is_title(char c) const {return to_title(c) == c;} 202 to_lower(char * res,const char * str) const203 char * to_lower(char * res, const char * str) const { 204 while (*str) *res++ = to_lower(*str++); *res = '\0'; return res;} to_upper(char * res,const char * str) const205 char * to_upper(char * res, const char * str) const { 206 while (*str) *res++ = to_upper(*str++); *res = '\0'; return res;} 207 to_lower(String & res,const char * str) const208 void to_lower(String & res, const char * str) const { 209 res.clear(); while (*str) res += to_lower(*str++);} to_upper(String & res,const char * str) const210 void to_upper(String & res, const char * str) const { 211 res.clear(); while (*str) res += to_upper(*str++);} 212 is_lower(const char * str) const213 bool is_lower(const char * str) const { 214 while (*str) {if (!is_lower(*str++)) return false;} return true;} is_upper(const char * str) const215 bool is_upper(const char * str) const { 216 while (*str) {if (!is_upper(*str++)) return false;} return true;} 217 218 // 219 // 220 // 221 to_plain(char c) const222 char to_plain(char c) const {return to_plain_[to_uchar(c)];} 223 de_accent(char c) const224 char de_accent(char c) const {return de_accent_[to_uchar(c)];} 225 special(char c) const226 SpecialChar special(char c) const {return special_[to_uchar(c)];} 227 char_type(char c) const228 CharType char_type(char c) const {return char_type_[to_uchar(c)];} is_alpha(char c) const229 bool is_alpha(char c) const {return char_type(c) > NonLetter;} 230 char_info(char c) const231 CharInfo char_info(char c) const {return char_info_[to_uchar(c)];} 232 233 // 234 // stripped 235 // 236 to_stripped(char c) const237 char to_stripped(char c) const {return to_stripped_[to_uchar(c)];} 238 239 // return a pointer to the END of the string to_stripped(char * res,const char * str) const240 char * to_stripped(char * res, const char * str) const { 241 for (; *str; ++str) { 242 char c = to_stripped(*str); 243 if (c) *res++ = c; 244 } 245 *res = '\0'; 246 return res; 247 } to_stripped(String & res,const char * str) const248 void to_stripped(String & res, const char * str) const { 249 res.clear(); 250 for (; *str; ++str) { 251 char c = to_stripped(*str); 252 if (c) res += c; 253 } 254 } 255 is_stripped(char c) const256 bool is_stripped(char c) const {return to_stripped(c) == c;} 257 is_stripped(const char * str) const258 bool is_stripped(const char * str) const { 259 while (*str) {if (!is_stripped(*str++)) return false;} return true;} 260 261 // 262 // Clean 263 // 264 // The "clean" form is how words are indixed in the dictionary. 265 // It will at very least convert the word to lower case. It may 266 // also strip accents and non-letters. 267 // 268 to_clean(char c) const269 char to_clean(char c) const {return to_clean_[to_uchar(c)];} 270 to_clean(char * res,const char * str) const271 char * to_clean(char * res, const char * str) const { 272 for (; *str; ++str) { 273 char c = to_clean(*str); 274 if (c) *res++ = c; 275 } 276 *res = '\0'; 277 return res; 278 } to_clean(String & res,const char * str) const279 void to_clean(String & res, const char * str) const { 280 res.clear(); 281 for (; *str; ++str) { 282 char c = to_clean(*str); 283 if (c) res += c; 284 } 285 } 286 is_clean(char c) const287 bool is_clean(char c) const {return to_clean(c) == c;} 288 is_clean(const char * str) const289 bool is_clean(const char * str) const { 290 while (*str) {if (!is_clean(*str++)) return false;} return true;} 291 is_clean_wi(WordInfo wi) const292 bool is_clean_wi(WordInfo wi) const { 293 return false; 294 //return wi & CASE_PATTEN == AllLower && 295 } 296 297 clean_chars() const298 const char * clean_chars() const {return clean_chars_.c_str();} 299 300 // 301 // Soundslike 302 // 303 have_soundslike() const304 bool have_soundslike() const {return have_soundslike_;} 305 soundslike_name() const306 const char * soundslike_name() const {return soundslike_->name();} soundslike_version() const307 const char * soundslike_version() const {return soundslike_->version();} 308 to_soundslike(String & res,ParmStr word) const309 void to_soundslike(String & res, ParmStr word) const { 310 res.resize(word.size()); 311 char * e = soundslike_->to_soundslike(res.data(), word.str(), word.size()); 312 res.resize(e - res.data()); 313 } 314 315 // returns a pointer to the END of the string to_soundslike(char * res,const char * str,int len=-1) const316 char * to_soundslike(char * res, const char * str, int len = -1) const { 317 return soundslike_->to_soundslike(res,str,len); 318 } 319 to_soundslike(char * res,const char * str,int len,WordInfo wi) const320 char * to_soundslike(char * res, const char * str, int len, WordInfo wi) const { 321 if (!have_soundslike_ && (wi & ALL_CLEAN)) return 0; 322 else return soundslike_->to_soundslike(res,str,len); 323 } 324 soundslike_chars() const325 const char * soundslike_chars() const {return soundslike_chars_.c_str();} 326 327 // 328 // Affix compression methods 329 // 330 affix() const331 const AffixMgr * affix() const {return affix_;} 332 have_affix() const333 bool have_affix() const {return affix_;} 334 munch(ParmStr word,GuessInfo * cl,bool cross=true) const335 void munch(ParmStr word, GuessInfo * cl, bool cross = true) const { 336 if (affix_) 337 affix_->munch(word, cl, cross); 338 } 339 expand(ParmStr word,ParmStr aff,ObjStack & buf,int limit=INT_MAX) const340 WordAff * expand(ParmStr word, ParmStr aff, 341 ObjStack & buf, int limit = INT_MAX) const { 342 if (affix_) 343 return affix_->expand(word, aff, buf, limit); 344 else 345 return fake_expand(word, aff, buf); 346 } 347 WordAff * fake_expand(ParmStr word, ParmStr aff, ObjStack & buf) const; 348 349 // 350 // Repl 351 // 352 have_repl() const353 bool have_repl() const {return have_repl_;} 354 repl() const355 SuggestReplEnumeration * repl() const { 356 return new SuggestReplEnumeration(repls_.pbegin(), repls_.pend());} 357 358 // 359 // 360 // 361 362 WordInfo get_word_info(ParmStr str) const; 363 364 // 365 // fix_case 366 // 367 368 CasePattern case_pattern(ParmStr str) const; 369 370 CasePattern case_pattern(const char * str, unsigned size) const; 371 fix_case(CasePattern case_pattern,char * str)372 void fix_case(CasePattern case_pattern, char * str) 373 { 374 if (!str[0]) return; 375 if (case_pattern == AllUpper) to_upper(str,str); 376 else if (case_pattern == FirstUpper) *str = to_title(*str); 377 } 378 void fix_case(CasePattern case_pattern, 379 char * res, const char * str) const; 380 const char * fix_case(CasePattern case_pattern, 381 const char * str, String & buf) const; 382 383 // 384 // 385 // 386 387 CompoundWord split_word(const char * str, unsigned size, bool camel_case) const; 388 389 // 390 // for cache 391 // 392 get_new(const String & lang,const Config * config)393 static inline PosibErr<Language *> get_new(const String & lang, const Config * config) { 394 StackPtr<Language> l(new Language()); 395 RET_ON_ERR(l->setup(lang, config)); 396 return l.release(); 397 } 398 cache_key_eq(const String & l) const399 bool cache_key_eq(const String & l) const {return name_ == l;} 400 }; 401 402 typedef Language LangImpl; 403 404 struct MsgConv : public ConvP 405 { MsgConvaspeller::MsgConv406 MsgConv(const Language * l) : ConvP(l->mesg_conv()) {} MsgConvaspeller::MsgConv407 MsgConv(const Language & l) : ConvP(l.mesg_conv()) {} 408 }; 409 410 struct InsensitiveCompare { 411 // compares to strings without regards to casing or special characters 412 const Language * lang; InsensitiveCompareaspeller::InsensitiveCompare413 InsensitiveCompare(const Language * l = 0) : lang(l) {} operator boolaspeller::InsensitiveCompare414 operator bool () const {return lang;} operator ()aspeller::InsensitiveCompare415 int operator() (const char * a, const char * b) const 416 { 417 char x, y; 418 for (;;) 419 { 420 while (x = lang->to_clean(*a++), !x); 421 while (y = lang->to_clean(*b++), !y); 422 if (x == 0x10 || y == 0x10 || x != y) break; 423 } 424 return static_cast<unsigned char>(x) - static_cast<unsigned char>(y); 425 } 426 }; 427 428 struct InsensitiveEqual { 429 InsensitiveCompare cmp; InsensitiveEqualaspeller::InsensitiveEqual430 InsensitiveEqual(const Language * l = 0) : cmp(l) {} operator ()aspeller::InsensitiveEqual431 bool operator() (const char * a, const char * b) const 432 { 433 return cmp(a,b) == 0; 434 } 435 }; 436 437 template <typename HASH_INT = size_t> 438 struct InsensitiveHash { 439 // hashes a string without regards to casing or special begin 440 // or end characters 441 const Language * lang; InsensitiveHashaspeller::InsensitiveHash442 InsensitiveHash() {} InsensitiveHashaspeller::InsensitiveHash443 InsensitiveHash(const Language * l) 444 : lang(l) {} operator ()aspeller::InsensitiveHash445 HASH_INT operator() (const char * s) const 446 { 447 HASH_INT h = 0; 448 for (;;) { 449 if (*s == 0) break; 450 unsigned char c = lang->to_clean(*s++); 451 if (c) h=5*h + c; 452 } 453 return h; 454 } 455 }; 456 457 struct SensitiveCompare { 458 const Language * lang; 459 bool case_insensitive; 460 bool ignore_accents; // unused 461 bool begin; // if not begin we are checking the end of the word 462 bool end; // if not end we are checking the beginning of the word 463 // if both false we are checking the middle of a word SensitiveCompareaspeller::SensitiveCompare464 SensitiveCompare(const Language * l = 0) 465 : lang(l), case_insensitive(false), ignore_accents(false), 466 begin(true), end(true) {} 467 bool operator() (const char * word, const char * inlist) const; 468 }; 469 470 struct CleanAffix { 471 const Language * lang; 472 OStream * log; 473 MsgConv msgconv1; 474 MsgConv msgconv2; 475 CleanAffix(const Language * lang0, OStream * log0); 476 char * operator() (ParmStr word, char * aff); 477 }; 478 479 class WordListIterator 480 { 481 public: 482 struct Value { 483 SimpleString word; 484 SimpleString aff; 485 }; 486 WordListIterator(StringEnumeration * in, 487 const Language * lang, 488 OStream * log); 489 // init may set "norm-strict" to true which is why it is not const 490 PosibErr<void> init (Config & config); 491 // init_plain initialized the iterator to read in a plain word 492 // list without any affix flags, for simplicity it will expect the 493 // input to be utf-8. It will also assume clean the words unless 494 // the `clean-words` option is explicitly specified. Like init it 495 // may set "norm-strict" to true which is why it is not const 496 PosibErr<void> init_plain (Config & config); operator *() const497 const Value & operator*() const {return val;} operator ->() const498 const Value * operator-> () const {return &val;} 499 PosibErr<bool> adv(); 500 private: 501 bool have_affix; 502 bool validate_words; 503 bool validate_affixes; 504 bool clean_words; 505 bool skip_invalid_words; 506 bool clean_affixes; 507 StringEnumeration * in; 508 const Language * lang; 509 ConvEC iconv; 510 OStream * log; 511 Value val; 512 String data; 513 const char * orig; 514 char * str; 515 char * str_end; 516 CleanAffix clean_affix; 517 }; 518 519 String get_stripped_chars(const Language & l); 520 521 String get_clean_chars(const Language & l); 522 523 PosibErr<void> check_if_sane(const Language & l, ParmStr word); 524 PosibErr<void> check_if_valid(const Language & l, ParmStr word); 525 PosibErr<void> validate_affix(const Language & l, ParmStr word, ParmStr aff); 526 527 bool find_language(Config & c); 528 529 PosibErr<Language *> new_language(const Config &, ParmStr lang = 0); 530 531 PosibErr<void> open_affix_file(const Config &, FStream & o); 532 } 533 534 535 #endif 536