1 /*
2   Copyright (c) 2006 - 2021
3   CLST - Radboud University
4   ILK  - Tilburg University
5 
6   This file is part of Ucto
7 
8   Ucto is free software; you can redistribute it and/or modify
9   it under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 3 of the License, or
11   (at your option) any later version.
12 
13   Ucto is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   GNU General Public License for more details.
17 
18   You should have received a copy of the GNU General Public License
19   along with this program.  If not, see <http://www.gnu.org/licenses/>.
20 
21   For questions and suggestions, see:
22       https://github.com/LanguageMachines/ucto/issues
23   or send mail to:
24       lamasoftware (at ) science.ru.nl
25 */
26 
27 #ifndef UCTO_SETTING_H
28 #define UCTO_SETTING_H
29 
// Forward declarations of the ticcutils types referenced by this header.
//
// NOTE(review): a forward declaration only suffices for pointers and
// references. TiCC::UniFilter is also used as a by-value data member of
// Tokenizer::Setting further down, which needs the complete type; so the
// header that defines it must already have been included before this file.
namespace TiCC {
  class LogStream;            // used through a pointer only
  class UnicodeRegexMatcher;  // used through a pointer only
  class UniFilter;            // used by value, see the note above
}
35 
36 namespace Tokenizer {
37 
38   using namespace icu;
39 
40   class Rule {
41     friend std::ostream& operator<< (std::ostream&, const Rule& );
42   public:
Rule()43   Rule(): regexp(0){
44     };
45     Rule( const UnicodeString& id, const UnicodeString& pattern);
46     ~Rule();
47     UnicodeString id;
48     UnicodeString pattern;
49     TiCC::UnicodeRegexMatcher *regexp;
50     bool matchAll( const UnicodeString&,
51 		   UnicodeString&,
52 		   UnicodeString&,
53 		   std::vector<UnicodeString>& );
54   private:
55     Rule( const Rule& ); // inhibit copies
56     Rule& operator=( const Rule& ); // inhibit copies
57   };
58 
59 
60   class Quoting {
61     friend std::ostream& operator<<( std::ostream&, const Quoting& );
62     struct QuotePair {
63       UnicodeString openQuote;
64       UnicodeString closeQuote;
65     };
66   public:
67     void add( const UnicodeString&, const UnicodeString& );
68     UnicodeString lookupOpen( const UnicodeString &) const;
69     UnicodeString lookupClose( const UnicodeString & ) const;
empty()70     bool empty() const { return _quotes.empty(); };
emptyStack()71     bool emptyStack() const { return quotestack.empty(); };
clearStack()72     void clearStack() { quoteindexstack.clear(); quotestack.clear(); };
73     int lookup( const UnicodeString&, int& );
eraseAtPos(int pos)74     void eraseAtPos( int pos ) {
75       quotestack.erase( quotestack.begin()+pos );
76       quoteindexstack.erase( quoteindexstack.begin()+pos );
77     }
78     void flushStack( int ); //renamed from eraseBeforeIndex
push(int i,UChar32 c)79     void push( int i, UChar32 c ){
80       quoteindexstack.push_back(i);
81       quotestack.push_back(c);
82     }
83   private:
84     std::vector<QuotePair> _quotes;
85     std::vector<int> quoteindexstack;
86     std::vector<UChar32> quotestack;
87   };
88 
89   class Setting {
90   public:
91     ~Setting();
92     bool read( const std::string&, const std::string&, int, TiCC::LogStream* );
93     bool read_rules( const std::string& );
94     bool read_filters( const std::string& );
95     bool read_quotes( const std::string& );
96     bool read_eosmarkers( const std::string& );
97     bool read_abbreviations( const std::string&,  UnicodeString& );
98     void add_rule( const UnicodeString&,
99 		   const std::vector<UnicodeString>& );
100     void sort_rules( std::map<UnicodeString, Rule *>&,
101 		     const std::vector<UnicodeString>& );
102     static std::set<std::string> installed_languages();
103     UnicodeString eosmarkers;
104     std::map<UnicodeString, UnicodeString> macros;
105     std::vector<Rule *> rules;
106     std::map<UnicodeString, Rule *> rulesmap;
107     std::map<UnicodeString, int> rules_index;
108     std::string splitter;
109     Quoting quotes;
110     TiCC::UniFilter filter;
111     std::string set_file; // the name of the settingsfile
112     std::string version;  // the version of the datafile
113     int tokDebug;
114     TiCC::LogStream *theErrLog;
115   };
116 
117 } // namespace Tokenizer
118 
119 #endif
120