1 /* Copyright (C) 2013, 2019 Olga Yakovleva <yakovleva.o.v@gmail.com> */ 2 3 /* This program is free software: you can redistribute it and/or modify */ 4 /* it under the terms of the GNU Lesser General Public License as published by */ 5 /* the Free Software Foundation, either version 2.1 of the License, or */ 6 /* (at your option) any later version. */ 7 8 /* This program is distributed in the hope that it will be useful, */ 9 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ 10 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ 11 /* GNU Lesser General Public License for more details. */ 12 13 /* You should have received a copy of the GNU Lesser General Public License */ 14 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */ 15 16 #ifndef RHVOICE_USERDICT_HPP 17 #define RHVOICE_USERDICT_HPP 18 19 #include <stdexcept> 20 #include <memory> 21 #include <vector> 22 #include <iterator> 23 #include <algorithm> 24 #include <sstream> 25 26 #include "str.hpp" 27 #include "item.hpp" 28 #include "relation.hpp" 29 #include "utterance.hpp" 30 #include "stress_pattern.hpp" 31 #include "trie.hpp" 32 33 namespace RHVoice 34 { 35 class language_info; 36 37 namespace userdict 38 { 39 typedef std::vector<utf8::uint32_t> chars32; 40 41 enum 42 { 43 token_start=0x10ffff+1, 44 token_end 45 }; 46 47 class position: public std::iterator<std::forward_iterator_tag,utf8::uint32_t> 48 { 49 public: position()50 position(): 51 token(0), 52 text(0), 53 character(token_start) 54 { 55 } 56 position(item & token_)57 explicit position(item& token_) 58 { 59 set_token(token_); 60 } 61 62 explicit position(utterance& utt); 63 operator *() const64 utf8::uint32_t operator*() const 65 { 66 return character; 67 } 68 operator ==(const position & other) const69 bool operator==(const position& other) const 70 { 71 if(token!=other.token) 72 return false; 73 if(token==0) 74 return true; 75 if(pos!=other.pos) 76 return false; 77 return (character==other.character); 78 } 79 operator !=(const position & other) const80 bool operator!=(const position& other) const 81 { 82 return !(*this==other); 83 } 84 operator ++()85 position& operator++() 86 { 87 if(token==0) 88 return *this; 89 if(character==token_end) 90 forward_token(); 91 else if(pos==text->end()) 92 { 93 character=token_end; 94 } 95 else 96 character=utf8::next(pos,text->end()); 97 return *this; 98 } 99 operator ++(int)100 position operator++(int) 101 { 102 position tmp=*this; 103 ++(*this); 104 return tmp; 105 } 106 get_token()107 item& get_token() 108 { 109 return *token; 110 } 111 get_token() const112 item& get_token() const 113 { 114 return *token; 115 } 116 same_token(const position & other) const117 bool same_token(const position& other) const 118 { 119 return (token==other.token); 120 } 121 122 void forward_token(); 123 124 private: clear()125 void clear() 126 { 127 token=0; 128 text=0; 129 pos=std::string::const_iterator(); 130 character=token_start; 131 } 132 set_token(item & token_)133 void set_token(item& token_) 134 { 135 token=&token_; 136 text=&(token->get("name").as<std::string>()); 137 pos=text->begin(); 138 character=token_start; 139 } 140 141 item* token; 142 const std::string* text; 143 std::string::const_iterator pos; 144 utf8::uint32_t character; 145 }; 146 147 class word_editor 148 { 149 public: word_editor(utterance & u)150 explicit word_editor(utterance& u): 151 utt(u), 152 cursor(u), 153 changed(false), 154 initialism(false) 155 { 156 } 157 get_utt()158 utterance& get_utt() 159 { 160 return utt; 161 } 162 get_cursor() const163 position get_cursor() const 164 { 165 return cursor; 166 } 167 delete_char()168 void delete_char() 169 { 170 utf8::uint32_t c=*cursor; 171 if(c>=token_start) 172 throw std::logic_error("Nothing to delete"); 173 ++cursor; 174 changed=true; 175 } 176 insert_char(utf8::uint32_t c)177 void insert_char(utf8::uint32_t c) 178 { 179 text.push_back(c); 180 changed=true; 181 } 182 forward_char()183 void forward_char() 184 { 185 utf8::uint32_t c=*cursor; 186 switch(c) 187 { 188 case token_start: 189 ++cursor; 190 break; 191 case token_end: 192 save_word(); 193 new_word(); 194 ++cursor; 195 break; 196 default: 197 text.push_back(c); 198 ++cursor; 199 break; 200 } 201 } 202 split_word()203 void split_word() 204 { 205 changed=true; 206 save_word(); 207 new_word(); 208 changed=true; 209 } 210 211 void mark_stress(); 212 stress_syllable(int number)213 void stress_syllable(int number) 214 { 215 stress.stress_syllable(number); 216 changed=true; 217 } 218 unstress_word()219 void unstress_word() 220 { 221 stress.unstress(); 222 changed=true; 223 } 224 decode_word_as_initialism()225 void decode_word_as_initialism() 226 { 227 initialism=true; 228 changed=true; 229 } 230 forward_token()231 void forward_token() 232 { 233 cursor.forward_token(); 234 } 235 236 private: 237 word_editor(const word_editor&); 238 word_editor& operator=(const word_editor&); 239 240 void save_word(); 241 void new_word(); 242 243 utterance& utt; 244 position cursor; 245 chars32 text; 246 bool changed; 247 stress_pattern stress; 248 bool initialism; 249 }; 250 251 class token 252 { 253 public: token()254 token(): 255 type(0) 256 { 257 } 258 append(utf8::uint32_t c)259 void append(utf8::uint32_t c) 260 { 261 text.push_back(c); 262 } 263 264 template<typename iterator> append(iterator first,iterator last)265 void append(iterator first,iterator last) 266 { 267 text.insert(text.end(),first,last); 268 } 269 get_type() const270 int get_type() const 271 { 272 return type; 273 } 274 set_type(int type_)275 void set_type(int type_) 276 { 277 type=type_; 278 } 279 get_text() const280 const chars32& get_text() const 281 { 282 return text; 283 } 284 as_string() const285 std::string as_string() const 286 { 287 std::string result; 288 std::copy(text.begin(),text.end(),str::utf8_inserter(std::back_inserter(result))); 289 return result; 290 } 291 as_number() const292 int as_number() const 293 { 294 int n=0; 295 std::istringstream s(as_string()); 296 s >> n; 297 return n; 298 } 299 300 private: 301 token(const token&); 302 token& operator=(const token&); 303 304 int type; 305 chars32 text; 306 }; 307 308 class correction 309 { 310 public: 311 typedef std::shared_ptr<correction> pointer; 312 ~correction()313 virtual ~correction() 314 { 315 } 316 get_key() const317 virtual chars32 get_key() const 318 { 319 return chars32(); 320 } 321 322 void virtual apply(word_editor& ed) const=0; 323 std::string virtual describe() const=0; 324 325 protected: correction()326 correction() 327 { 328 } 329 330 private: 331 correction(const correction&); 332 correction& operator=(const correction&); 333 }; 334 335 class substring: public correction 336 { 337 public: substring(const token * t)338 explicit substring(const token* t): 339 key(t->get_text()) 340 { 341 } 342 get_key() const343 chars32 get_key() const 344 { 345 return key; 346 } 347 apply(word_editor & ed) const348 void apply(word_editor& ed) const 349 { 350 for(std::size_t i=0;i<key.size();++i) 351 { 352 ed.forward_char(); 353 } 354 } 355 describe() const356 std::string describe() const 357 { 358 std::string desc("(Substring "); 359 utf8::utf32to8(key.begin(),key.end(),std::back_inserter(desc)); 360 desc+=')'; 361 return desc; 362 } 363 364 protected: 365 chars32 key; 366 }; 367 368 class symbol: public correction 369 { 370 public: symbol(const token * t)371 explicit symbol(const token* t): 372 chr(t->get_text()[0]) 373 { 374 } 375 get_key() const376 chars32 get_key() const 377 { 378 return chars32(1,chr); 379 } 380 apply(word_editor & ed) const381 void apply(word_editor& ed) const 382 { 383 ed.delete_char(); 384 } 385 describe() const386 std::string describe() const 387 { 388 std::string desc("(Symbol "); 389 utf8::append(chr,std::back_inserter(desc)); 390 desc+=')'; 391 return desc; 392 } 393 394 private: 395 utf8::uint32_t chr; 396 }; 397 398 class deletion: public substring 399 { 400 public: deletion(const token * t)401 explicit deletion(const token* t): 402 substring(t) 403 { 404 } 405 apply(word_editor & ed) const406 void apply(word_editor& ed) const 407 { 408 for(std::size_t i=0;i<key.size();++i) 409 { 410 ed.delete_char(); 411 } 412 } 413 describe() const414 std::string describe() const 415 { 416 std::string desc("(Delete "); 417 utf8::utf32to8(key.begin(),key.end(),std::back_inserter(desc)); 418 desc+=')'; 419 return desc; 420 } 421 }; 422 423 class insertion: public correction 424 { 425 public: insertion(const token * t)426 explicit insertion(const token* t): 427 text(t->get_text()) 428 { 429 } 430 apply(word_editor & ed) const431 void apply(word_editor& ed) const 432 { 433 for(chars32::const_iterator it=text.begin();it!=text.end();++it) 434 { 435 ed.insert_char(*it); 436 } 437 } 438 describe() const439 std::string describe() const 440 { 441 std::string desc("(Insert "); 442 utf8::utf32to8(text.begin(),text.end(),std::back_inserter(desc)); 443 desc+=')'; 444 return desc; 445 } 446 447 private: 448 chars32 text; 449 }; 450 451 class empty_string: public correction 452 { 453 public: apply(word_editor & ed) const454 void apply(word_editor& ed) const 455 { 456 } 457 describe() const458 std::string describe() const 459 { 460 return "EmptyString"; 461 } 462 }; 463 464 class stress_mark: public correction 465 { 466 public: apply(word_editor & ed) const467 void apply(word_editor& ed) const 468 { 469 ed.mark_stress(); 470 } 471 describe() const472 std::string describe() const 473 { 474 return "StressMark"; 475 } 476 }; 477 478 class stressed_syl_number: public correction 479 { 480 public: stressed_syl_number(const token * t)481 explicit stressed_syl_number(const token* t): 482 number(t->as_number()) 483 { 484 } 485 apply(word_editor & ed) const486 void apply(word_editor& ed) const 487 { 488 ed.stress_syllable(number); 489 } 490 describe() const491 std::string describe() const 492 { 493 std::ostringstream s; 494 s << "(StressSyllable " << number << ")" << std::endl; 495 return s.str(); 496 } 497 498 private: 499 int number; 500 }; 501 502 class unstressed_flag: public correction 503 { 504 public: apply(word_editor & ed) const505 void apply(word_editor& ed) const 506 { 507 ed.unstress_word(); 508 } 509 describe() const510 std::string describe() const 511 { 512 return "Unstressed"; 513 } 514 }; 515 516 class initialism_flag: public correction 517 { 518 public: apply(word_editor & ed) const519 void apply(word_editor& ed) const 520 { 521 ed.decode_word_as_initialism(); 522 } 523 describe() const524 std::string describe() const 525 { 526 return "Initialism"; 527 } 528 }; 529 530 class start_of_token: public correction 531 { 532 public: get_key() const533 chars32 get_key() const 534 { 535 return chars32(1,token_start); 536 } 537 apply(word_editor & ed) const538 void apply(word_editor& ed) const 539 { 540 ed.forward_char(); 541 } 542 describe() const543 std::string describe() const 544 { 545 return "StartOfToken"; 546 } 547 }; 548 549 class end_of_token: public correction 550 { 551 public: get_key() const552 chars32 get_key() const 553 { 554 return chars32(1,token_end); 555 } 556 apply(word_editor & ed) const557 void apply(word_editor& ed) const 558 { 559 ed.forward_char(); 560 } 561 describe() const562 std::string describe() const 563 { 564 return "EndOfToken"; 565 } 566 }; 567 568 class word_break: public correction 569 { 570 public: apply(word_editor & ed) const571 void apply(word_editor& ed) const 572 { 573 ed.split_word(); 574 } 575 describe() const576 std::string describe() const 577 { 578 return "WordBreak"; 579 } 580 }; 581 582 class rule 583 { 584 private: 585 typedef std::vector<correction::pointer> list_of_corrections; 586 list_of_corrections corrections; 587 588 public: empty() const589 bool empty() const 590 { 591 return corrections.empty(); 592 } 593 get_key() const594 chars32 get_key() const 595 { 596 chars32 result; 597 for(list_of_corrections::const_iterator it=corrections.begin();it!=corrections.end();++it) 598 { 599 chars32 key=(*it)->get_key(); 600 result.insert(result.end(),key.begin(),key.end()); 601 } 602 return result; 603 } 604 apply(word_editor & ed) const605 void apply(word_editor& ed) const 606 { 607 for(list_of_corrections::const_iterator it=corrections.begin();it!=corrections.end();++it) 608 { 609 (*it)->apply(ed); 610 } 611 } 612 describe() const613 std::string describe() const 614 { 615 std::string desc; 616 for(list_of_corrections::const_iterator it=corrections.begin();it!=corrections.end();++it) 617 { 618 desc+=(*it)->describe(); 619 desc+=' '; 620 } 621 return desc; 622 } 623 append(const rule & other)624 void append(const rule& other) 625 { 626 if(other.empty()) 627 return; 628 corrections.reserve(corrections.size()+other.corrections.size()); 629 corrections.insert(corrections.end(),other.corrections.begin(),other.corrections.end()); 630 } 631 632 template<class T> append()633 void append() 634 { 635 corrections.push_back(correction::pointer(new T)); 636 } 637 638 template<class T,typename A> append(const A & a)639 void append(const A& a) 640 { 641 corrections.push_back(correction::pointer(new T(a))); 642 } 643 }; 644 645 class ruleset 646 { 647 private: 648 typedef std::vector<rule> list_of_rules; 649 list_of_rules rules; 650 651 public: ruleset()652 ruleset() 653 { 654 } 655 656 typedef std::vector<rule>::const_iterator iterator; 657 begin() const658 iterator begin() const 659 { 660 return rules.begin(); 661 } 662 end() const663 iterator end() const 664 { 665 return rules.end(); 666 } 667 empty() const668 bool empty() const 669 { 670 return rules.empty(); 671 } 672 describe() const673 std::string describe() const 674 { 675 std::string desc; 676 for(iterator it=begin();it!=end();++it) 677 { 678 desc+=it->describe(); 679 desc+='\n'; 680 } 681 return desc; 682 } 683 684 void append(const ruleset* other); 685 686 template<class T> append()687 void append() 688 { 689 rule r; 690 r.append<T>(); 691 append(r); 692 } 693 694 template<class T,typename A> append(const A & a)695 void append(const A& a) 696 { 697 rule r; 698 r.append<T,A>(a); 699 append(r); 700 } 701 extend(const ruleset * other)702 void extend(const ruleset* other) 703 { 704 if(other->empty()) 705 return; 706 rules.reserve(rules.size()+other->rules.size()); 707 rules.insert(rules.end(),other->rules.begin(),other->rules.end()); 708 } 709 710 template<class T> extend()711 void extend() 712 { 713 rule r; 714 r.append<T>(); 715 extend(r); 716 } 717 718 template<class T,typename A> extend(const A & a)719 void extend(const A& a) 720 { 721 rule r; 722 r.append<T,A>(a); 723 extend(r); 724 } 725 726 template<class T> create()727 static inline ruleset* create() 728 { 729 std::unique_ptr<ruleset> rs(new ruleset); 730 rs->append<T>(); 731 return rs.release(); 732 } 733 734 template<class T,typename A> create(const A & a)735 static inline ruleset* create(const A& a) 736 { 737 std::unique_ptr<ruleset> rs(new ruleset); 738 rs->append<T,A>(a); 739 return rs.release(); 740 } 741 742 private: 743 ruleset(const ruleset&); 744 ruleset& operator=(const ruleset&); 745 746 void append(const rule& r); 747 extend(const rule & r)748 void extend(const rule& r) 749 { 750 rules.push_back(r); 751 } 752 }; 753 754 struct parse_state 755 { 756 std::unique_ptr<ruleset> result; 757 bool error; 758 parse_stateRHVoice::userdict::parse_state759 parse_state(): 760 result(new ruleset), 761 error(false) 762 { 763 } 764 }; 765 766 class dict 767 { 768 public: 769 explicit dict(const language_info& lng); 770 void apply_rules(utterance& u) const; 771 772 private: 773 dict(const dict&); 774 dict& operator=(const dict&); 775 776 struct to_lower 777 { operator ()RHVoice::userdict::dict::to_lower778 utf8::uint32_t operator()(utf8::uint32_t c) const 779 { 780 if(c>=token_start) 781 return c; 782 else 783 return str::tolower(c); 784 } 785 }; 786 787 void load_all(); 788 void load_dir(const std::string& path); 789 void load_file(const std::string& file_path); 790 bool should_ignore_token(const position& pos) const; 791 792 const language_info& lang; 793 trie<utf8::uint32_t,rule,to_lower> rules; 794 }; 795 } 796 } 797 #endif 798