1 /* 2 Copyright (c) 2006 - 2021 3 CLST - Radboud University 4 ILK - Tilburg University 5 6 This file is part of Ucto 7 8 Ucto is free software; you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation; either version 3 of the License, or 11 (at your option) any later version. 12 13 Ucto is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program. If not, see <http://www.gnu.org/licenses/>. 20 21 For questions and suggestions, see: 22 https://github.com/LanguageMachines/ucto/issues 23 or send mail to: 24 lamasoftware (at ) science.ru.nl 25 */ 26 27 #ifndef UCTO_SETTING_H 28 #define UCTO_SETTING_H 29 30 namespace TiCC { 31 class LogStream; 32 class UnicodeRegexMatcher; 33 class UniFilter; 34 } 35 36 namespace Tokenizer { 37 38 using namespace icu; 39 40 class Rule { 41 friend std::ostream& operator<< (std::ostream&, const Rule& ); 42 public: Rule()43 Rule(): regexp(0){ 44 }; 45 Rule( const UnicodeString& id, const UnicodeString& pattern); 46 ~Rule(); 47 UnicodeString id; 48 UnicodeString pattern; 49 TiCC::UnicodeRegexMatcher *regexp; 50 bool matchAll( const UnicodeString&, 51 UnicodeString&, 52 UnicodeString&, 53 std::vector<UnicodeString>& ); 54 private: 55 Rule( const Rule& ); // inhibit copies 56 Rule& operator=( const Rule& ); // inhibit copies 57 }; 58 59 60 class Quoting { 61 friend std::ostream& operator<<( std::ostream&, const Quoting& ); 62 struct QuotePair { 63 UnicodeString openQuote; 64 UnicodeString closeQuote; 65 }; 66 public: 67 void add( const UnicodeString&, const UnicodeString& ); 68 UnicodeString lookupOpen( const UnicodeString &) const; 69 UnicodeString lookupClose( const UnicodeString & ) const; empty()70 bool empty() const { return _quotes.empty(); }; emptyStack()71 bool emptyStack() const { return quotestack.empty(); }; clearStack()72 void clearStack() { quoteindexstack.clear(); quotestack.clear(); }; 73 int lookup( const UnicodeString&, int& ); eraseAtPos(int pos)74 void eraseAtPos( int pos ) { 75 quotestack.erase( quotestack.begin()+pos ); 76 quoteindexstack.erase( quoteindexstack.begin()+pos ); 77 } 78 void flushStack( int ); //renamed from eraseBeforeIndex push(int i,UChar32 c)79 void push( int i, UChar32 c ){ 80 quoteindexstack.push_back(i); 81 quotestack.push_back(c); 82 } 83 private: 84 std::vector<QuotePair> _quotes; 85 std::vector<int> quoteindexstack; 86 std::vector<UChar32> quotestack; 87 }; 88 89 class Setting { 90 public: 91 ~Setting(); 92 bool read( const std::string&, const std::string&, int, TiCC::LogStream* ); 93 bool read_rules( const std::string& ); 94 bool read_filters( const std::string& ); 95 bool read_quotes( const std::string& ); 96 bool read_eosmarkers( const std::string& ); 97 bool read_abbreviations( const std::string&, UnicodeString& ); 98 void add_rule( const UnicodeString&, 99 const std::vector<UnicodeString>& ); 100 void sort_rules( std::map<UnicodeString, Rule *>&, 101 const std::vector<UnicodeString>& ); 102 static std::set<std::string> installed_languages(); 103 UnicodeString eosmarkers; 104 std::map<UnicodeString, UnicodeString> macros; 105 std::vector<Rule *> rules; 106 std::map<UnicodeString, Rule *> rulesmap; 107 std::map<UnicodeString, int> rules_index; 108 std::string splitter; 109 Quoting quotes; 110 TiCC::UniFilter filter; 111 std::string set_file; // the name of the settingsfile 112 std::string version; // the version of the datafile 113 int tokDebug; 114 TiCC::LogStream *theErrLog; 115 }; 116 117 } // namespace Tokenizer 118 119 #endif 120