1 /*
2   Copyright (c) 2006 - 2021
3   CLST - Radboud University
4   ILK  - Tilburg University
5 
6   This file is part of Ucto
7 
8   Ucto is free software; you can redistribute it and/or modify
9   it under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 3 of the License, or
11   (at your option) any later version.
12 
13   Ucto is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   GNU General Public License for more details.
17 
18   You should have received a copy of the GNU General Public License
19   along with this program.  If not, see <http://www.gnu.org/licenses/>.
20 
21   For questions and suggestions, see:
22       https://github.com/LanguageMachines/ucto/issues
23   or send mail to:
24       lamasoftware (at ) science.ru.nl
25 
26 */
27 
28 #include <unistd.h>
29 #include <cstring>
30 #include <cstdlib>
31 #include <iostream>
32 #include <fstream>
33 #include <sstream>
34 #include <vector>
35 #include <algorithm>
36 #include "config.h"
37 #include "ticcutils/StringOps.h"
38 #include "ticcutils/FileUtils.h"
39 #include "ticcutils/PrettyPrint.h"
40 #include "ticcutils/LogStream.h"
41 #include "ticcutils/Unicode.h"
42 #include "libfolia/folia.h"
43 #include "ucto/setting.h"
44 
45 using namespace std;
46 using TiCC::operator<<;
47 
// Shorthand for writing to the error log. It expands to a dereferenced
// TiCC::Log() call on 'theErrLog', so a variable of that name must be in
// scope wherever LOG is used.
#define LOG *TiCC::Log(theErrLog)

// Fallback for the directory holding the ucto data files: only used when
// UCTODATA_DIR was not defined already (e.g. by the build).
#ifndef UCTODATA_DIR
#define UCTODATA_DIR string(SYSCONF_PATH) + "/ucto/"
#endif
53 
54 namespace Tokenizer {
55 
56   using namespace icu;
57   using TiCC::operator<<;
58 
  // Directory that is searched for settings files which cannot be found
  // under the name given; initialised from UCTODATA_DIR.
  string defaultConfigDir = UCTODATA_DIR;

  // The sections a settings file can contain. NONE means 'not inside a
  // recognised section'; every other value corresponds to one of the
  // "[...]" headers recognised by getMode().
  enum ConfigMode { NONE, RULES, ABBREVIATIONS, ATTACHEDPREFIXES,
		    ATTACHEDSUFFIXES, PREFIXES, SUFFIXES, TOKENS, UNITS,
		    ORDINALS, EOSMARKERS, QUOTES, CURRENCY,
		    FILTER, RULEORDER, METARULES };
65 
getMode(const UnicodeString & line)66   ConfigMode getMode( const UnicodeString& line ) {
67     ConfigMode mode = NONE;
68     if (line == "[RULES]") {
69       mode = RULES;
70     }
71     else if (line == "[META-RULES]") {
72       mode = METARULES;
73     }
74     else if (line == "[RULE-ORDER]") {
75       mode = RULEORDER;
76     }
77     else if (line == "[ABBREVIATIONS]") {
78       mode = ABBREVIATIONS;
79     }
80     else if (line == "[ATTACHEDPREFIXES]") {
81       mode = ATTACHEDPREFIXES;
82     }
83     else if (line == "[ATTACHEDSUFFIXES]") {
84       mode = ATTACHEDSUFFIXES;
85     }
86     else if (line == "[PREFIXES]") {
87       mode = PREFIXES;
88     }
89     else if (line == "[SUFFIXES]") {
90       mode = SUFFIXES;
91     }
92     else if (line == "[TOKENS]") {
93       mode = TOKENS;
94     }
95     else if (line == "[CURRENCY]") {
96       mode = CURRENCY;
97     }
98     else if (line == "[UNITS]") {
99       mode = UNITS;
100     }
101     else if (line == "[ORDINALS]") {
102       mode = ORDINALS;
103     }
104     else if (line == "[EOSMARKERS]") {
105       mode = EOSMARKERS;
106     }
107     else if (line == "[QUOTES]") {
108       mode = QUOTES;
109     }
110     else if (line == "[FILTER]") {
111       mode = FILTER;
112     }
113     else {
114       mode = NONE;
115     }
116     return mode;
117   }
118 
119   class uConfigError: public std::invalid_argument {
120   public:
uConfigError(const string & s,const string & f)121     uConfigError( const string& s, const string& f ):
122       invalid_argument( "ucto: " + s + " (" + f + ")"  ){};
uConfigError(const UnicodeString & us,const string & f)123     uConfigError( const UnicodeString& us, const string& f ):
124       uConfigError( TiCC::UnicodeToUTF8(us), f ){};
125   };
126 
127   class uLogicError: public std::logic_error {
128   public:
uLogicError(const string & s)129     explicit uLogicError( const string& s ): logic_error( "ucto: logic error:" + s ){};
130   };
131 
operator <<(ostream & os,const Quoting & q)132   ostream& operator<<( ostream& os, const Quoting& q ){
133     for( const auto& quote : q._quotes ){
134       os << quote.openQuote << "\t" << quote.closeQuote << endl;
135     }
136     return os;
137   }
138 
flushStack(int beginindex)139   void Quoting::flushStack( int beginindex ) {
140     //flush up to (but not including) the specified index
141     if ( !quotestack.empty() ){
142       std::vector<int> new_quoteindexstack;
143       std::vector<UChar32> new_quotestack;
144       for ( size_t i = 0; i < quotestack.size(); i++) {
145 	if (quoteindexstack[i] >= beginindex ) {
146 	  new_quotestack.push_back(quotestack[i]);
147 	  new_quoteindexstack.push_back(quoteindexstack[i]-beginindex);
148 	}
149       }
150       quoteindexstack = new_quoteindexstack;
151       quotestack = new_quotestack;
152     }
153   }
154 
add(const UnicodeString & o,const UnicodeString & c)155   void Quoting::add( const UnicodeString& o, const UnicodeString& c ){
156     QuotePair quote;
157     quote.openQuote = o;
158     quote.closeQuote = c;
159     _quotes.push_back( quote );
160   }
161 
lookup(const UnicodeString & open,int & stackindex)162   int Quoting::lookup( const UnicodeString& open, int& stackindex ){
163     if (quotestack.empty() || (quotestack.size() != quoteindexstack.size())) return -1;
164     auto it = quotestack.crbegin();
165     size_t i = quotestack.size();
166     while ( it != quotestack.crend() ){
167       if ( open.indexOf( *it ) >= 0 ){
168  	stackindex = i-1;
169  	return quoteindexstack[stackindex];
170       }
171       --i;
172       ++it;
173     }
174     return -1;
175   }
176 
lookupOpen(const UnicodeString & q) const177   UnicodeString Quoting::lookupOpen( const UnicodeString &q ) const {
178     auto res = find_if( _quotes.begin(),
179 			_quotes.end(),
180 			[q]( const QuotePair& qp){ return qp.openQuote.indexOf(q) >=0; } );
181     if ( res != _quotes.end() ){
182       return res->closeQuote;
183     }
184     else {
185       return "";
186     }
187   }
188 
lookupClose(const UnicodeString & q) const189   UnicodeString Quoting::lookupClose( const UnicodeString &q ) const {
190     UnicodeString res;
191     for ( const auto& quote : _quotes ){
192       if ( quote.closeQuote.indexOf(q) >= 0 )
193 	return quote.openQuote;
194     }
195     return "";
196   }
197 
  /// Destroy the Rule, releasing the regular expression matcher that was
  /// allocated in the constructor.
  Rule::~Rule() {
    delete regexp;
  }
201 
  /// Create a Rule.
  ///
  /// @param _id      the name of the rule
  /// @param _pattern the regular expression of the rule
  ///
  /// Both values are stored, and a TiCC::UnicodeRegexMatcher is allocated
  /// for the pattern (the id is passed along to the matcher as well).
  Rule::Rule( const UnicodeString& _id, const UnicodeString& _pattern):
    id(_id), pattern(_pattern) {
    // owned by this Rule; released in ~Rule()
    regexp = new TiCC::UnicodeRegexMatcher( pattern, id );
  }
206 
operator <<(std::ostream & os,const Rule & r)207   ostream& operator<< (std::ostream& os, const Rule& r ){
208     if ( r.regexp ){
209       os << r.id << "=\"" << r.regexp->Pattern() << "\"";
210     }
211     else
212       os << r.id  << "=NULL";
213     return os;
214   }
215 
matchAll(const UnicodeString & line,UnicodeString & pre,UnicodeString & post,vector<UnicodeString> & matches)216   bool Rule::matchAll( const UnicodeString& line,
217 		       UnicodeString& pre,
218 		       UnicodeString& post,
219 		       vector<UnicodeString>& matches ){
220     matches.clear();
221     pre = "";
222     post = "";
223 #ifdef MATCH_DEBUG
224     cerr << "match: " << id << endl;
225 #endif
226     if ( regexp && regexp->match_all( line, pre, post ) ){
227       int num = regexp->NumOfMatches();
228       if ( num >=1 ){
229 	for( int i=1; i <= num; ++i ){
230 	  matches.push_back( regexp->get_match( i ) );
231 	}
232       }
233       else {
234 	matches.push_back( regexp->get_match( 0 ) );
235       }
236       return true;
237     }
238     return false;
239   }
240 
~Setting()241   Setting::~Setting(){
242     for ( const auto rule : rules ) {
243       delete rule;
244     }
245     rulesmap.clear();
246   }
247 
installed_languages()248   set<string> Setting::installed_languages() {
249     // we only return 'languages' which are installed as 'tokconfig-*'
250     //
251     vector<string> files = TiCC::searchFilesMatch( defaultConfigDir, "tokconfig-*" );
252     set<string> result;
253     for ( auto const& f : files ){
254       string base = TiCC::basename(f);
255       size_t pos = base.find("tokconfig-");
256       if ( pos == 0 ){
257 	string lang = base.substr( 10 );
258 	result.insert( lang );
259       }
260     }
261     return result;
262   }
263 
read_rules(const string & fname)264   bool Setting::read_rules( const string& fname ){
265     if ( tokDebug > 0 ){
266       LOG << "%include " << fname << endl;
267     }
268     ifstream f( fname );
269     if ( !f ){
270       return false;
271     }
272     else {
273       string rawline;
274       while ( getline( f, rawline ) ){
275 	UnicodeString line = TiCC::UnicodeFromUTF8(rawline);
276 	line.trim();
277 	if ((line.length() > 0) && (line[0] != '#')) {
278 	  if ( tokDebug >= 5 ){
279 	    LOG << "include line = " << rawline << endl;
280 	  }
281 	  const int splitpoint = line.indexOf("=");
282 	  if ( splitpoint < 0 ){
283 	    throw uConfigError( "invalid RULES entry: " + line,
284 				fname );
285 	  }
286 	  UnicodeString id = UnicodeString( line, 0,splitpoint);
287 	  UnicodeString pat = UnicodeString( line, splitpoint+1);
288 	  rulesmap[id] = new Rule( id, pat );
289 	}
290       }
291     }
292     return true;
293   }
294 
read_filters(const string & fname)295   bool Setting::read_filters( const string& fname ){
296     if ( tokDebug > 0 ){
297       LOG << "%include " << fname << endl;
298     }
299     return filter.fill( fname );
300   }
301 
read_quotes(const string & fname)302   bool Setting::read_quotes( const string& fname ){
303     if ( tokDebug > 0 ){
304       LOG << "%include " << fname << endl;
305     }
306     ifstream f( fname );
307     if ( !f ){
308       return false;
309     }
310     else {
311       string rawline;
312       while ( getline( f, rawline ) ){
313 	UnicodeString line = TiCC::UnicodeFromUTF8(rawline);
314 	line.trim();
315 	if ((line.length() > 0) && (line[0] != '#')) {
316 	  if ( tokDebug >= 5 ){
317 	    LOG << "include line = " << rawline << endl;
318 	  }
319 	  int splitpoint = line.indexOf(" ");
320 	  if ( splitpoint == -1 ){
321 	    splitpoint = line.indexOf("\t");
322 	  }
323 	  if ( splitpoint == -1 ){
324 	    throw uConfigError( "invalid QUOTES entry: " + line
325 				+ " (missing whitespace)",
326 				fname );
327 	  }
328 	  UnicodeString open = UnicodeString( line, 0,splitpoint);
329 	  UnicodeString close = UnicodeString( line, splitpoint+1);
330 	  open = open.trim().unescape();
331 	  close = close.trim().unescape();
332 	  if ( open.isEmpty() || close.isEmpty() ){
333 	    throw uConfigError( "invalid QUOTES entry: " + line, fname );
334 	  }
335 	  else {
336 	    quotes.add( open, close );
337 	  }
338 	}
339       }
340     }
341     return true;
342   }
343 
read_eosmarkers(const string & fname)344   bool Setting::read_eosmarkers( const string& fname ){
345     if ( tokDebug > 0 ){
346       LOG << "%include " << fname << endl;
347     }
348     ifstream f( fname );
349     if ( !f ){
350       return false;
351     }
352     else {
353       string rawline;
354       while ( getline( f, rawline ) ){
355 	UnicodeString line = TiCC::UnicodeFromUTF8(rawline);
356 	line.trim();
357 	if ((line.length() > 0) && (line[0] != '#')) {
358 	  if ( tokDebug >= 5 ){
359 	    LOG << "include line = " << rawline << endl;
360 	  }
361 	  if ( ( line.startsWith("\\u") && line.length() == 6 ) ||
362 	       ( line.startsWith("\\U") && line.length() == 10 ) ){
363 	    UnicodeString uit = line.unescape();
364 	    if ( uit.isEmpty() ){
365 	      throw uConfigError( "Invalid EOSMARKERS entry: " + line, fname );
366 	    }
367 	    eosmarkers += uit;
368 	  }
369 	}
370       }
371     }
372     return true;
373   }
374 
escape_regex(const UnicodeString & entry)375   UnicodeString escape_regex( const UnicodeString& entry ){
376     UnicodeString result;
377     for ( int i=0; i < entry.length(); ++i ){
378       switch ( entry[i] ){
379       case '?':
380       case '^':
381       case '$':
382       case '[':
383       case ']':
384       case '(':
385       case ')':
386       case '{':
387       case '}':
388       case '*':
389       case '.':
390       case '+':
391       case '|':
392       case '-':
393 	if ( i == 0 || entry[i-1] != '\\' ){
394 	  // not escaped
395 	  result += "\\";
396 	}
397 	// fallthrough
398       default:
399 	result += entry[i];
400       }
401     }
402     return result;
403   }
404 
read_abbreviations(const string & fname,UnicodeString & abbreviations)405   bool Setting::read_abbreviations( const string& fname,
406 				    UnicodeString& abbreviations ){
407     if ( tokDebug > 0 ){
408       LOG << "%include " << fname << endl;
409     }
410     ifstream f( fname );
411     if ( !f ){
412       return false;
413     }
414     else {
415       string rawline;
416       while ( getline( f, rawline ) ){
417 	UnicodeString line = TiCC::UnicodeFromUTF8(rawline);
418 	line.trim();
419 	if ((line.length() > 0) && (line[0] != '#')) {
420 	  if ( tokDebug >= 5 ){
421 	    LOG << "include line = " << rawline << endl;
422 	  }
423 	  line = escape_regex( line );
424 	  if ( !abbreviations.isEmpty()){
425 	    abbreviations += '|';
426 	  }
427 	  abbreviations += line;
428 	}
429       }
430     }
431     return true;
432   }
433 
add_rule(const UnicodeString & name,const vector<UnicodeString> & parts)434   void Setting::add_rule( const UnicodeString& name,
435 			  const vector<UnicodeString>& parts ){
436     UnicodeString pat;
437     for ( auto const& part : parts ){
438       pat += part;
439     }
440     rulesmap[name] = new Rule( name, pat );
441   }
442 
sort_rules(map<UnicodeString,Rule * > & rulesmap,const vector<UnicodeString> & sort)443   void Setting::sort_rules( map<UnicodeString, Rule *>& rulesmap,
444 			    const vector<UnicodeString>& sort ){
445     // LOG << "rules voor sort : " << endl;
446     // for ( size_t i=0; i < rules.size(); ++i ){
447     //   LOG << "rule " << i << " " << *rules[i] << endl;
448     // }
449     int index = 0;
450     if ( !sort.empty() ){
451       for ( auto const& id : sort ){
452 	auto it = rulesmap.find( id );
453 	if ( it != rulesmap.end() ){
454 	  rules.push_back( it->second );
455 	  rules_index[id] = ++index;
456 	  rulesmap.erase( it );
457 	}
458 	else {
459 	  LOG << set_file << ": RULE-ORDER specified for undefined RULE '"
460 	      << id << "'" << endl;
461 	}
462       }
463       for ( auto const& it : rulesmap ){
464 	LOG << set_file << ": No RULE-ORDER specified for RULE '"
465 	    << it.first << "' (put at end)." << endl;
466 	rules.push_back( it.second );
467 	rules_index[it.first] = ++index;
468       }
469     }
470     else {
471       for ( auto const& it : rulesmap ){
472 	rules.push_back( it.second );
473 	rules_index[it.first] = ++index;
474       }
475     }
476     // LOG << "rules NA sort : " << endl;
477     // for ( size_t i=0; i < result.size(); ++i ){
478     //   LOG << "rule " << i << " " << *result[i] << endl;
479     // }
480   }
481 
get_filename(const string & name)482   string get_filename( const string& name ){
483     string result;
484     if ( TiCC::isFile( name ) ){
485       result = name;
486     }
487     else {
488       result = defaultConfigDir + name;
489       if ( !TiCC::isFile( result ) ){
490 	result.clear();
491       }
492     }
493     return result;
494   }
495 
addOrder(vector<UnicodeString> & order,map<UnicodeString,int> & reverse_order,int & index,UnicodeString & line,const string & fn)496   void addOrder( vector<UnicodeString>& order,
497 		 map<UnicodeString,int>& reverse_order,
498 		 int& index,
499 		 UnicodeString &line,
500 		 const string& fn ){
501     try {
502       TiCC::UnicodeRegexMatcher m( "\\s+" );
503       vector<UnicodeString> usv;
504       m.split( line, usv );
505       for ( const auto& us : usv  ){
506 	if ( reverse_order.find( us ) != reverse_order.end() ){
507 	  throw uConfigError( "multiple entry " + us + " in RULE-ORDER", fn );
508 	}
509 	order.push_back( us );
510 	reverse_order[us] = ++index;
511       }
512     }
513     catch ( const uConfigError& ){
514       throw;
515     }
516     catch ( exception& e ){
517       throw uConfigError( "problem in line:" + line, "" );
518     }
519   }
520 
split(const string & version,int & major,int & minor,string & sub)521   void split( const string& version, int& major, int& minor, string& sub ){
522     vector<string> parts = TiCC::split_at( version, "." );
523     size_t num = parts.size();
524     major = 0;
525     minor = 0;
526     sub.clear();
527     if ( num == 0 ){
528       sub = version;
529     }
530     else if ( num == 1 ){
531       if ( !TiCC::stringTo( parts[0], major ) ){
532 	sub = version;
533       }
534     }
535     else if ( num == 2 ){
536       if ( !TiCC::stringTo( parts[0], major ) ){
537 	sub = version;
538       }
539       else if ( !TiCC::stringTo( parts[1], minor ) ){
540 	sub = parts[1];
541       }
542     }
543     else if ( num > 2 ){
544       if ( !TiCC::stringTo( parts[0], major ) ){
545 	sub = version;
546       }
547       else if ( !TiCC::stringTo( parts[1], minor ) ){
548 	sub = parts[1];
549       }
550       else {
551 	for ( size_t i=2; i < num; ++i ){
552 	  sub += parts[i];
553 	  if ( i < num-1 )
554 	    sub += ".";
555 	}
556       }
557     }
558   }
559 
substitute_macros(const UnicodeString & in,const map<UnicodeString,UnicodeString> & macros)560   UnicodeString substitute_macros( const UnicodeString& in,
561 				   const map<UnicodeString,UnicodeString>& macros ){
562     UnicodeString result = in;
563     for ( const auto& it : macros ){
564       result.findAndReplace( it.first, it.second );
565     }
566     return result;
567   }
568 
read(const string & settings_name,const string & add_tokens,int dbg,TiCC::LogStream * ls)569   bool Setting::read( const string& settings_name,
570 		      const string& add_tokens,
571 		      int dbg, TiCC::LogStream* ls ) {
572     tokDebug = dbg;
573     theErrLog = ls;
574     splitter = "%";
575     map<ConfigMode, UnicodeString> patterns = { { ABBREVIATIONS, "" },
576 						{ TOKENS, "" },
577 						{ PREFIXES, "" },
578 						{ SUFFIXES, "" },
579 						{ ATTACHEDPREFIXES, "" },
580 						{ ATTACHEDSUFFIXES, "" },
581 						{ UNITS, "" },
582 						{ ORDINALS, "" } };
583     vector<UnicodeString> rules_order;
584     vector<string> meta_rules;
585     string conffile = get_filename( settings_name );
586 
587     if ( !TiCC::isFile( conffile ) ){
588       LOG << "Unable to open configfile: " << conffile << endl;
589       return false;
590     }
591     if ( !add_tokens.empty() && !TiCC::isFile( add_tokens ) ){
592       LOG << "Unable to open additional tokens file: " << add_tokens << endl;
593       return false;
594     }
595     ifstream f( conffile );
596     if ( f ){
597       ConfigMode mode = NONE;
598       set_file = settings_name;
599       if ( tokDebug ){
600 	LOG << "config file=" << conffile << endl;
601       }
602       int rule_count = 0;
603       string rawline;
604       while ( getline( f, rawline ) ){
605 	if ( rawline.find( "%include" ) != string::npos ){
606 	  string file = rawline.substr( 9 );
607 	  switch ( mode ){
608 	  case RULES: {
609 	    if ( !TiCC::match_back( file, ".rule" ) ){
610 	      file += ".rule";
611 	    }
612 	    file = get_filename( file );
613 	    if ( !read_rules( file ) ){
614 	      throw uConfigError( "'" + rawline + "' failed", set_file );
615 	    }
616 	  }
617 	    break;
618 	  case FILTER:{
619 	    if ( !TiCC::match_back( file, ".filter" ) ){
620 	      file += ".filter";
621 	    }
622 	    file = get_filename( file );
623 	    if ( !read_filters( file ) ){
624 	      throw uConfigError( "'" + rawline + "' failed", set_file );
625 	    }
626 	  }
627 	    break;
628 	  case QUOTES:{
629 	    if ( !TiCC::match_back( file, ".quote" ) ){
630 	      file += ".quote";
631 	    }
632 	    file = get_filename( file );
633 	    if ( !read_quotes( file ) ){
634 	      throw uConfigError( "'" + rawline + "' failed", set_file );
635 	    }
636 	  }
637 	    break;
638 	  case EOSMARKERS:{
639 	    if ( !TiCC::match_back( file, ".eos" ) ){
640 	      file += ".eos";
641 	    }
642 	    file = get_filename( file );
643 	    if ( !read_eosmarkers( file ) ){
644 	      throw uConfigError( "'" + rawline + "' failed", set_file );
645 	    }
646 	  }
647 	    break;
648 	  case ABBREVIATIONS:{
649 	    if ( !TiCC::match_back( file, ".abr" ) ){
650 	      file += ".abr";
651 	    }
652 	    file = get_filename( file );
653 	    if ( !read_abbreviations( file, patterns[ABBREVIATIONS] ) ){
654 	      throw uConfigError( "'" + rawline + "' failed", set_file );
655 	    }
656 	  }
657 	    break;
658 	  default:
659 	    throw uConfigError( string("%include not implemented for this section"),
660 				set_file );
661 	  }
662 	  continue;
663 	}
664 	else if ( rawline.find( "%define" ) != string::npos ){
665 	  string def = rawline.substr( 8 );
666 	  vector<string> parts = TiCC::split_at_first_of( def, " \t", 2 );
667 	  if ( parts.size() < 2 ){
668 	    throw uConfigError( "invalid %define: " + rawline, set_file );
669 	  }
670 	  UnicodeString macro = TiCC::UnicodeFromUTF8(splitter)
671 	    + TiCC::UnicodeFromUTF8(parts[0]) + TiCC::UnicodeFromUTF8(splitter);
672 	  macros[macro] = TiCC::UnicodeFromUTF8(parts[1]);
673 	  continue;
674 	}
675 	else if ( rawline.find( "SPLITTER=" ) != string::npos ){
676 	  string local_splitter = rawline.substr( 9 );
677 	  if ( local_splitter.empty() ) {
678 	    throw uConfigError( "invalid SPLITTER value in: " + rawline,
679 				set_file );
680 	  }
681 	  if ( local_splitter[0] == '"'
682 	       && local_splitter[local_splitter.length()-1] == '"' ){
683 	    local_splitter = local_splitter.substr(1,local_splitter.length()-2);
684 	  }
685 	  if ( tokDebug > 5 ){
686 	    LOG << "SET SPLITTER: '" << local_splitter << "'" << endl;
687 	  }
688 	  if ( local_splitter != splitter ){
689 	    LOG << "updating splitter to: '" << local_splitter << "'" << endl;
690 	  }
691 	  splitter = local_splitter;
692 	  continue;
693 	}
694 
695 	UnicodeString line = TiCC::UnicodeFromUTF8(rawline);
696 	line.trim();
697 	if ((line.length() > 0) && (line[0] != '#')) {
698 	  if (line[0] == '[') {
699 	    mode = getMode( line );
700 	  }
701 	  else {
702 	    if ( line[0] == '\\' && line.length() > 1 && line[1] == '[' ){
703 	      line = UnicodeString( line, 1 );
704 	    }
705 	    line = substitute_macros( line, macros );
706 	    switch( mode ){
707 	    case RULES: {
708 	      const int splitpoint = line.indexOf("=");
709 	      if ( splitpoint < 0 ){
710 		throw uConfigError( "invalid RULES entry: " + line,
711 				    set_file );
712 	      }
713 	      UnicodeString id = UnicodeString( line, 0,splitpoint);
714 	      UnicodeString pat = UnicodeString( line, splitpoint+1);
715 	      rulesmap[id] = new Rule( id, pat );
716 	    }
717 	      break;
718 	    case RULEORDER:
719 	      addOrder( rules_order, rules_index,
720 			rule_count, line, set_file );
721 	      break;
722 	    case METARULES:
723 	      meta_rules.push_back( TiCC::UnicodeToUTF8(line) );
724 	      break;
725 	    case ABBREVIATIONS:
726 	    case ATTACHEDPREFIXES:
727 	    case ATTACHEDSUFFIXES:
728 	    case PREFIXES:
729 	    case SUFFIXES:
730 	    case TOKENS:
731 	    case CURRENCY:
732 	    case UNITS:
733 	    case ORDINALS:
734 	      if ( !patterns[mode].isEmpty() )
735 		patterns[mode] += '|';
736 	      patterns[mode] += line;
737 	      break;
738 	    case EOSMARKERS:
739 	      if ( ( line.startsWith("\\u") && line.length() == 6 ) ||
740 		   ( line.startsWith("\\U") && line.length() == 10 ) ){
741 		UnicodeString uit = line.unescape();
742 		if ( uit.isEmpty() ){
743 		  throw uConfigError( "Invalid EOSMARKERS entry: " + line,
744 				      set_file );
745 		}
746 		eosmarkers += uit;
747 	      }
748 	      break;
749 	    case QUOTES: {
750 	      int splitpoint = line.indexOf(" ");
751 	      if ( splitpoint == -1 )
752 		splitpoint = line.indexOf("\t");
753 	      if ( splitpoint == -1 ){
754 		throw uConfigError( "invalid QUOTES entry: " + line
755 				    + " (missing whitespace)",
756 				    set_file );
757 	      }
758 	      UnicodeString open = UnicodeString( line, 0,splitpoint);
759 	      UnicodeString close = UnicodeString( line, splitpoint+1);
760 	      open = open.trim().unescape();
761 	      close = close.trim().unescape();
762 	      if ( open.isEmpty() || close.isEmpty() ){
763 		throw uConfigError( "invalid QUOTES entry: " + line,
764 				    set_file );
765 	      }
766 	      else {
767 		quotes.add( open, close );
768 	      }
769 	    }
770 	      break;
771 	    case FILTER:
772 	      filter.add( line );
773 	      break;
774 	    case NONE: {
775 	      vector<string> parts = TiCC::split_at( rawline, "=" );
776 	      if ( parts.size() == 2 ) {
777 		if ( parts[0] == "version" ){
778 		  version = parts[1];
779 		}
780 	      }
781 	    }
782 	      break;
783 	    default:
784 	      throw uLogicError( "unhandled case in switch" );
785 	    }
786 	  }
787 	}
788       }
789 
790       // set reasonable defaults for those items that are NOT set
791       // in the configfile
792       if ( eosmarkers.length() == 0 ){
793 	eosmarkers = ".!?";
794       }
795       if ( quotes.empty() ){
796 	quotes.add( '"', '"' );
797 	quotes.add( "‘", "’" );
798 	quotes.add( "“„‟", "”" );
799       }
800 
801       if ( !add_tokens.empty() ){
802 	ifstream adt( add_tokens );
803 	string line;
804 	while ( getline( adt, line ) ){
805 	  UnicodeString entry = TiCC::UnicodeFromUTF8(line);
806 	  entry = escape_regex( entry );
807 	  if ( !entry.isEmpty() ){
808 	    if ( !patterns[TOKENS].isEmpty() ){
809 	      patterns[TOKENS] += '|';
810 	    }
811 	    patterns[TOKENS] += entry;
812 	  }
813 	}
814       }
815       // Create Rules for every pattern that is set
816       // first the meta rules...
817       for ( const auto& mr : meta_rules ){
818 	string::size_type pos = mr.find( "=" );
819 	if ( pos == string::npos ){
820 	  throw uConfigError( "invalid entry in META-RULES: " + mr,
821 			      set_file );
822 	}
823 	string nam = TiCC::trim( mr.substr( 0, pos ) );
824 	if ( nam == "SPLITTER" ){
825 	  string local_splitter = mr.substr( pos+1 );
826 	  if ( local_splitter.empty() ) {
827 	    throw uConfigError( "invalid SPLITTER value in META-RULES: " + mr,
828 				set_file );
829 	  }
830 	  if ( local_splitter[0] == '"'
831 	       && local_splitter[local_splitter.length()-1] == '"' ){
832 	    local_splitter = local_splitter.substr(1,local_splitter.length()-2);
833 	  }
834 	  if ( tokDebug > 5 ){
835 	    LOG << "SET SPLITTER: '" << local_splitter << "'" << endl;
836 	  }
837 	  if ( local_splitter != splitter ){
838 	    LOG << "updating splitter to: '" << local_splitter << "'" << endl;
839 	  }
840 	  splitter = local_splitter;
841 	  continue;
842 	}
843 	UnicodeString name = TiCC::UnicodeFromUTF8( nam );
844 	string rule = mr.substr( pos+1 );
845 	if ( tokDebug > 5 ){
846 	  LOG << "SPLIT using: '" << splitter << "'" << endl;
847 	}
848 	vector<string> parts = TiCC::split_at( rule, splitter );
849 	for ( auto& str : parts ){
850 	  str = TiCC::trim( str );
851 	}
852 	vector<UnicodeString> new_parts;
853 	vector<UnicodeString> undef_parts;
854 	bool skip_rule = false;
855 	for ( const auto& part : parts ){
856 	  UnicodeString meta = TiCC::UnicodeFromUTF8( part );
857 	  ConfigMode local_mode = getMode( "[" + meta + "]" );
858 	  switch ( local_mode ){
859 	  case ORDINALS:
860 	  case ABBREVIATIONS:
861 	  case TOKENS:
862 	  case ATTACHEDPREFIXES:
863 	  case ATTACHEDSUFFIXES:
864 	  case UNITS:
865 	  case CURRENCY:
866 	  case PREFIXES:
867 	  case SUFFIXES:
868 	    if ( !patterns[local_mode].isEmpty()){
869 	      UnicodeString val = substitute_macros( patterns[local_mode],
870 						     macros );
871 	      new_parts.push_back( val );
872 	    }
873 	    else {
874 	      undef_parts.push_back( meta );
875 	      skip_rule = true;
876 	    }
877 	    break;
878 	  case NONE:
879 	  default:
880 	    new_parts.push_back( substitute_macros( TiCC::UnicodeFromUTF8(part),
881 						    macros ) );
882 	    break;
883 	  }
884 	}
885 	if ( skip_rule ){
886 	  using TiCC::operator<<;
887 	  LOG << set_file << ": skipping META rule: '" << name
888 	      << "', it mentions unknown pattern: '"
889 	      << undef_parts <<"'" << endl;
890 	}
891 	else {
892 	  add_rule( name, new_parts );
893 	}
894       }
895       sort_rules( rulesmap, rules_order );
896     }
897     else {
898       return false;
899     }
900     int major = -1;
901     int minor = -1;
902     string sub;
903     if ( !version.empty() ){
904       split( version, major, minor, sub );
905       if ( tokDebug ){
906 	LOG << set_file << ": version=" << version << endl;
907       }
908     }
909     if ( major < 0 || minor < 2 ){
910       if ( version.empty() ){
911 	LOG << "WARNING: your datafile for '" + set_file
912 	    << "' is missing a version number" << endl;
913 	LOG << "         Did you install uctodata version >=0.2 ?" << endl;
914 	LOG << "         or do you use your own setingsfile? Then please add a version number." << endl;
915       }
916       else {
917 	LOG << "WARNING: your datafile '" + set_file
918 	    << "' has version: " << version << endl;
919 	LOG << "         for best results, you should a file with version >=0.2 " << endl;
920       }
921     }
922     if ( tokDebug ){
923       LOG << "effective rules: " << endl;
924       for ( size_t i=0; i < rules.size(); ++i ){
925 	LOG << "rule " << i << " " << *rules[i] << endl;
926       }
927       LOG << "EOS markers: " << eosmarkers << endl;
928       LOG << "Quotations: " << quotes << endl;
929       try {
930 	LOG << "Filter: " << filter << endl;
931       }
932       catch (...){
933       }
934     }
935     return true;
936   }
937 
938 }//namespace
939