1 /* 2 Copyright (c) 2006 - 2021 3 CLST - Radboud University 4 ILK - Tilburg University 5 6 This file is part of Ucto 7 8 Ucto is free software; you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation; either version 3 of the License, or 11 (at your option) any later version. 12 13 Ucto is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program. If not, see <http://www.gnu.org/licenses/>. 20 21 For questions and suggestions, see: 22 https://github.com/LanguageMachines/ucto/issues 23 or send mail to: 24 lamasoftware (at ) science.ru.nl 25 26 */ 27 28 #include <unistd.h> 29 #include <cstring> 30 #include <cstdlib> 31 #include <iostream> 32 #include <fstream> 33 #include <sstream> 34 #include <vector> 35 #include <algorithm> 36 #include "config.h" 37 #include "ticcutils/StringOps.h" 38 #include "ticcutils/FileUtils.h" 39 #include "ticcutils/PrettyPrint.h" 40 #include "ticcutils/LogStream.h" 41 #include "ticcutils/Unicode.h" 42 #include "libfolia/folia.h" 43 #include "ucto/setting.h" 44 45 using namespace std; 46 using TiCC::operator<<; 47 48 #define LOG *TiCC::Log(theErrLog) 49 50 #ifndef UCTODATA_DIR 51 #define UCTODATA_DIR string(SYSCONF_PATH) + "/ucto/" 52 #endif 53 54 namespace Tokenizer { 55 56 using namespace icu; 57 using TiCC::operator<<; 58 59 string defaultConfigDir = UCTODATA_DIR; 60 61 enum ConfigMode { NONE, RULES, ABBREVIATIONS, ATTACHEDPREFIXES, 62 ATTACHEDSUFFIXES, PREFIXES, SUFFIXES, TOKENS, UNITS, 63 ORDINALS, EOSMARKERS, QUOTES, CURRENCY, 64 FILTER, RULEORDER, METARULES }; 65 getMode(const UnicodeString & line)66 ConfigMode getMode( const UnicodeString& line ) { 67 ConfigMode mode = NONE; 68 if (line == "[RULES]") { 69 mode = RULES; 70 } 71 else if (line == "[META-RULES]") { 72 mode = METARULES; 73 } 74 else if (line == "[RULE-ORDER]") { 75 mode = RULEORDER; 76 } 77 else if (line == "[ABBREVIATIONS]") { 78 mode = ABBREVIATIONS; 79 } 80 else if (line == "[ATTACHEDPREFIXES]") { 81 mode = ATTACHEDPREFIXES; 82 } 83 else if (line == "[ATTACHEDSUFFIXES]") { 84 mode = ATTACHEDSUFFIXES; 85 } 86 else if (line == "[PREFIXES]") { 87 mode = PREFIXES; 88 } 89 else if (line == "[SUFFIXES]") { 90 mode = SUFFIXES; 91 } 92 else if (line == "[TOKENS]") { 93 mode = TOKENS; 94 } 95 else if (line == "[CURRENCY]") { 96 mode = CURRENCY; 97 } 98 else if (line == "[UNITS]") { 99 mode = UNITS; 100 } 101 else if (line == "[ORDINALS]") { 102 mode = ORDINALS; 103 } 104 else if (line == "[EOSMARKERS]") { 105 mode = EOSMARKERS; 106 } 107 else if (line == "[QUOTES]") { 108 mode = QUOTES; 109 } 110 else if (line == "[FILTER]") { 111 mode = FILTER; 112 } 113 else { 114 mode = NONE; 115 } 116 return mode; 117 } 118 119 class uConfigError: public std::invalid_argument { 120 public: uConfigError(const string & s,const string & f)121 uConfigError( const string& s, const string& f ): 122 invalid_argument( "ucto: " + s + " (" + f + ")" ){}; uConfigError(const UnicodeString & us,const string & f)123 uConfigError( const UnicodeString& us, const string& f ): 124 uConfigError( TiCC::UnicodeToUTF8(us), f ){}; 125 }; 126 127 class uLogicError: public std::logic_error { 128 public: uLogicError(const string & s)129 explicit uLogicError( const string& s ): logic_error( "ucto: logic error:" + s ){}; 130 }; 131 operator <<(ostream & os,const Quoting & q)132 ostream& operator<<( ostream& os, const Quoting& q ){ 133 for( const auto& quote : q._quotes ){ 134 os << quote.openQuote << "\t" << quote.closeQuote << endl; 135 } 136 return os; 137 } 138 flushStack(int beginindex)139 void Quoting::flushStack( int beginindex ) { 140 //flush up to (but not including) the specified index 141 if ( !quotestack.empty() ){ 142 std::vector<int> new_quoteindexstack; 143 std::vector<UChar32> new_quotestack; 144 for ( size_t i = 0; i < quotestack.size(); i++) { 145 if (quoteindexstack[i] >= beginindex ) { 146 new_quotestack.push_back(quotestack[i]); 147 new_quoteindexstack.push_back(quoteindexstack[i]-beginindex); 148 } 149 } 150 quoteindexstack = new_quoteindexstack; 151 quotestack = new_quotestack; 152 } 153 } 154 add(const UnicodeString & o,const UnicodeString & c)155 void Quoting::add( const UnicodeString& o, const UnicodeString& c ){ 156 QuotePair quote; 157 quote.openQuote = o; 158 quote.closeQuote = c; 159 _quotes.push_back( quote ); 160 } 161 lookup(const UnicodeString & open,int & stackindex)162 int Quoting::lookup( const UnicodeString& open, int& stackindex ){ 163 if (quotestack.empty() || (quotestack.size() != quoteindexstack.size())) return -1; 164 auto it = quotestack.crbegin(); 165 size_t i = quotestack.size(); 166 while ( it != quotestack.crend() ){ 167 if ( open.indexOf( *it ) >= 0 ){ 168 stackindex = i-1; 169 return quoteindexstack[stackindex]; 170 } 171 --i; 172 ++it; 173 } 174 return -1; 175 } 176 lookupOpen(const UnicodeString & q) const177 UnicodeString Quoting::lookupOpen( const UnicodeString &q ) const { 178 auto res = find_if( _quotes.begin(), 179 _quotes.end(), 180 [q]( const QuotePair& qp){ return qp.openQuote.indexOf(q) >=0; } ); 181 if ( res != _quotes.end() ){ 182 return res->closeQuote; 183 } 184 else { 185 return ""; 186 } 187 } 188 lookupClose(const UnicodeString & q) const189 UnicodeString Quoting::lookupClose( const UnicodeString &q ) const { 190 UnicodeString res; 191 for ( const auto& quote : _quotes ){ 192 if ( quote.closeQuote.indexOf(q) >= 0 ) 193 return quote.openQuote; 194 } 195 return ""; 196 } 197 ~Rule()198 Rule::~Rule() { 199 delete regexp; 200 } 201 Rule(const UnicodeString & _id,const UnicodeString & _pattern)202 Rule::Rule( const UnicodeString& _id, const UnicodeString& _pattern): 203 id(_id), pattern(_pattern) { 204 regexp = new TiCC::UnicodeRegexMatcher( pattern, id ); 205 } 206 operator <<(std::ostream & os,const Rule & r)207 ostream& operator<< (std::ostream& os, const Rule& r ){ 208 if ( r.regexp ){ 209 os << r.id << "=\"" << r.regexp->Pattern() << "\""; 210 } 211 else 212 os << r.id << "=NULL"; 213 return os; 214 } 215 matchAll(const UnicodeString & line,UnicodeString & pre,UnicodeString & post,vector<UnicodeString> & matches)216 bool Rule::matchAll( const UnicodeString& line, 217 UnicodeString& pre, 218 UnicodeString& post, 219 vector<UnicodeString>& matches ){ 220 matches.clear(); 221 pre = ""; 222 post = ""; 223 #ifdef MATCH_DEBUG 224 cerr << "match: " << id << endl; 225 #endif 226 if ( regexp && regexp->match_all( line, pre, post ) ){ 227 int num = regexp->NumOfMatches(); 228 if ( num >=1 ){ 229 for( int i=1; i <= num; ++i ){ 230 matches.push_back( regexp->get_match( i ) ); 231 } 232 } 233 else { 234 matches.push_back( regexp->get_match( 0 ) ); 235 } 236 return true; 237 } 238 return false; 239 } 240 ~Setting()241 Setting::~Setting(){ 242 for ( const auto rule : rules ) { 243 delete rule; 244 } 245 rulesmap.clear(); 246 } 247 installed_languages()248 set<string> Setting::installed_languages() { 249 // we only return 'languages' which are installed as 'tokconfig-*' 250 // 251 vector<string> files = TiCC::searchFilesMatch( defaultConfigDir, "tokconfig-*" ); 252 set<string> result; 253 for ( auto const& f : files ){ 254 string base = TiCC::basename(f); 255 size_t pos = base.find("tokconfig-"); 256 if ( pos == 0 ){ 257 string lang = base.substr( 10 ); 258 result.insert( lang ); 259 } 260 } 261 return result; 262 } 263 read_rules(const string & fname)264 bool Setting::read_rules( const string& fname ){ 265 if ( tokDebug > 0 ){ 266 LOG << "%include " << fname << endl; 267 } 268 ifstream f( fname ); 269 if ( !f ){ 270 return false; 271 } 272 else { 273 string rawline; 274 while ( getline( f, rawline ) ){ 275 UnicodeString line = TiCC::UnicodeFromUTF8(rawline); 276 line.trim(); 277 if ((line.length() > 0) && (line[0] != '#')) { 278 if ( tokDebug >= 5 ){ 279 LOG << "include line = " << rawline << endl; 280 } 281 const int splitpoint = line.indexOf("="); 282 if ( splitpoint < 0 ){ 283 throw uConfigError( "invalid RULES entry: " + line, 284 fname ); 285 } 286 UnicodeString id = UnicodeString( line, 0,splitpoint); 287 UnicodeString pat = UnicodeString( line, splitpoint+1); 288 rulesmap[id] = new Rule( id, pat ); 289 } 290 } 291 } 292 return true; 293 } 294 read_filters(const string & fname)295 bool Setting::read_filters( const string& fname ){ 296 if ( tokDebug > 0 ){ 297 LOG << "%include " << fname << endl; 298 } 299 return filter.fill( fname ); 300 } 301 read_quotes(const string & fname)302 bool Setting::read_quotes( const string& fname ){ 303 if ( tokDebug > 0 ){ 304 LOG << "%include " << fname << endl; 305 } 306 ifstream f( fname ); 307 if ( !f ){ 308 return false; 309 } 310 else { 311 string rawline; 312 while ( getline( f, rawline ) ){ 313 UnicodeString line = TiCC::UnicodeFromUTF8(rawline); 314 line.trim(); 315 if ((line.length() > 0) && (line[0] != '#')) { 316 if ( tokDebug >= 5 ){ 317 LOG << "include line = " << rawline << endl; 318 } 319 int splitpoint = line.indexOf(" "); 320 if ( splitpoint == -1 ){ 321 splitpoint = line.indexOf("\t"); 322 } 323 if ( splitpoint == -1 ){ 324 throw uConfigError( "invalid QUOTES entry: " + line 325 + " (missing whitespace)", 326 fname ); 327 } 328 UnicodeString open = UnicodeString( line, 0,splitpoint); 329 UnicodeString close = UnicodeString( line, splitpoint+1); 330 open = open.trim().unescape(); 331 close = close.trim().unescape(); 332 if ( open.isEmpty() || close.isEmpty() ){ 333 throw uConfigError( "invalid QUOTES entry: " + line, fname ); 334 } 335 else { 336 quotes.add( open, close ); 337 } 338 } 339 } 340 } 341 return true; 342 } 343 read_eosmarkers(const string & fname)344 bool Setting::read_eosmarkers( const string& fname ){ 345 if ( tokDebug > 0 ){ 346 LOG << "%include " << fname << endl; 347 } 348 ifstream f( fname ); 349 if ( !f ){ 350 return false; 351 } 352 else { 353 string rawline; 354 while ( getline( f, rawline ) ){ 355 UnicodeString line = TiCC::UnicodeFromUTF8(rawline); 356 line.trim(); 357 if ((line.length() > 0) && (line[0] != '#')) { 358 if ( tokDebug >= 5 ){ 359 LOG << "include line = " << rawline << endl; 360 } 361 if ( ( line.startsWith("\\u") && line.length() == 6 ) || 362 ( line.startsWith("\\U") && line.length() == 10 ) ){ 363 UnicodeString uit = line.unescape(); 364 if ( uit.isEmpty() ){ 365 throw uConfigError( "Invalid EOSMARKERS entry: " + line, fname ); 366 } 367 eosmarkers += uit; 368 } 369 } 370 } 371 } 372 return true; 373 } 374 escape_regex(const UnicodeString & entry)375 UnicodeString escape_regex( const UnicodeString& entry ){ 376 UnicodeString result; 377 for ( int i=0; i < entry.length(); ++i ){ 378 switch ( entry[i] ){ 379 case '?': 380 case '^': 381 case '$': 382 case '[': 383 case ']': 384 case '(': 385 case ')': 386 case '{': 387 case '}': 388 case '*': 389 case '.': 390 case '+': 391 case '|': 392 case '-': 393 if ( i == 0 || entry[i-1] != '\\' ){ 394 // not escaped 395 result += "\\"; 396 } 397 // fallthrough 398 default: 399 result += entry[i]; 400 } 401 } 402 return result; 403 } 404 read_abbreviations(const string & fname,UnicodeString & abbreviations)405 bool Setting::read_abbreviations( const string& fname, 406 UnicodeString& abbreviations ){ 407 if ( tokDebug > 0 ){ 408 LOG << "%include " << fname << endl; 409 } 410 ifstream f( fname ); 411 if ( !f ){ 412 return false; 413 } 414 else { 415 string rawline; 416 while ( getline( f, rawline ) ){ 417 UnicodeString line = TiCC::UnicodeFromUTF8(rawline); 418 line.trim(); 419 if ((line.length() > 0) && (line[0] != '#')) { 420 if ( tokDebug >= 5 ){ 421 LOG << "include line = " << rawline << endl; 422 } 423 line = escape_regex( line ); 424 if ( !abbreviations.isEmpty()){ 425 abbreviations += '|'; 426 } 427 abbreviations += line; 428 } 429 } 430 } 431 return true; 432 } 433 add_rule(const UnicodeString & name,const vector<UnicodeString> & parts)434 void Setting::add_rule( const UnicodeString& name, 435 const vector<UnicodeString>& parts ){ 436 UnicodeString pat; 437 for ( auto const& part : parts ){ 438 pat += part; 439 } 440 rulesmap[name] = new Rule( name, pat ); 441 } 442 sort_rules(map<UnicodeString,Rule * > & rulesmap,const vector<UnicodeString> & sort)443 void Setting::sort_rules( map<UnicodeString, Rule *>& rulesmap, 444 const vector<UnicodeString>& sort ){ 445 // LOG << "rules voor sort : " << endl; 446 // for ( size_t i=0; i < rules.size(); ++i ){ 447 // LOG << "rule " << i << " " << *rules[i] << endl; 448 // } 449 int index = 0; 450 if ( !sort.empty() ){ 451 for ( auto const& id : sort ){ 452 auto it = rulesmap.find( id ); 453 if ( it != rulesmap.end() ){ 454 rules.push_back( it->second ); 455 rules_index[id] = ++index; 456 rulesmap.erase( it ); 457 } 458 else { 459 LOG << set_file << ": RULE-ORDER specified for undefined RULE '" 460 << id << "'" << endl; 461 } 462 } 463 for ( auto const& it : rulesmap ){ 464 LOG << set_file << ": No RULE-ORDER specified for RULE '" 465 << it.first << "' (put at end)." << endl; 466 rules.push_back( it.second ); 467 rules_index[it.first] = ++index; 468 } 469 } 470 else { 471 for ( auto const& it : rulesmap ){ 472 rules.push_back( it.second ); 473 rules_index[it.first] = ++index; 474 } 475 } 476 // LOG << "rules NA sort : " << endl; 477 // for ( size_t i=0; i < result.size(); ++i ){ 478 // LOG << "rule " << i << " " << *result[i] << endl; 479 // } 480 } 481 get_filename(const string & name)482 string get_filename( const string& name ){ 483 string result; 484 if ( TiCC::isFile( name ) ){ 485 result = name; 486 } 487 else { 488 result = defaultConfigDir + name; 489 if ( !TiCC::isFile( result ) ){ 490 result.clear(); 491 } 492 } 493 return result; 494 } 495 addOrder(vector<UnicodeString> & order,map<UnicodeString,int> & reverse_order,int & index,UnicodeString & line,const string & fn)496 void addOrder( vector<UnicodeString>& order, 497 map<UnicodeString,int>& reverse_order, 498 int& index, 499 UnicodeString &line, 500 const string& fn ){ 501 try { 502 TiCC::UnicodeRegexMatcher m( "\\s+" ); 503 vector<UnicodeString> usv; 504 m.split( line, usv ); 505 for ( const auto& us : usv ){ 506 if ( reverse_order.find( us ) != reverse_order.end() ){ 507 throw uConfigError( "multiple entry " + us + " in RULE-ORDER", fn ); 508 } 509 order.push_back( us ); 510 reverse_order[us] = ++index; 511 } 512 } 513 catch ( const uConfigError& ){ 514 throw; 515 } 516 catch ( exception& e ){ 517 throw uConfigError( "problem in line:" + line, "" ); 518 } 519 } 520 split(const string & version,int & major,int & minor,string & sub)521 void split( const string& version, int& major, int& minor, string& sub ){ 522 vector<string> parts = TiCC::split_at( version, "." ); 523 size_t num = parts.size(); 524 major = 0; 525 minor = 0; 526 sub.clear(); 527 if ( num == 0 ){ 528 sub = version; 529 } 530 else if ( num == 1 ){ 531 if ( !TiCC::stringTo( parts[0], major ) ){ 532 sub = version; 533 } 534 } 535 else if ( num == 2 ){ 536 if ( !TiCC::stringTo( parts[0], major ) ){ 537 sub = version; 538 } 539 else if ( !TiCC::stringTo( parts[1], minor ) ){ 540 sub = parts[1]; 541 } 542 } 543 else if ( num > 2 ){ 544 if ( !TiCC::stringTo( parts[0], major ) ){ 545 sub = version; 546 } 547 else if ( !TiCC::stringTo( parts[1], minor ) ){ 548 sub = parts[1]; 549 } 550 else { 551 for ( size_t i=2; i < num; ++i ){ 552 sub += parts[i]; 553 if ( i < num-1 ) 554 sub += "."; 555 } 556 } 557 } 558 } 559 substitute_macros(const UnicodeString & in,const map<UnicodeString,UnicodeString> & macros)560 UnicodeString substitute_macros( const UnicodeString& in, 561 const map<UnicodeString,UnicodeString>& macros ){ 562 UnicodeString result = in; 563 for ( const auto& it : macros ){ 564 result.findAndReplace( it.first, it.second ); 565 } 566 return result; 567 } 568 read(const string & settings_name,const string & add_tokens,int dbg,TiCC::LogStream * ls)569 bool Setting::read( const string& settings_name, 570 const string& add_tokens, 571 int dbg, TiCC::LogStream* ls ) { 572 tokDebug = dbg; 573 theErrLog = ls; 574 splitter = "%"; 575 map<ConfigMode, UnicodeString> patterns = { { ABBREVIATIONS, "" }, 576 { TOKENS, "" }, 577 { PREFIXES, "" }, 578 { SUFFIXES, "" }, 579 { ATTACHEDPREFIXES, "" }, 580 { ATTACHEDSUFFIXES, "" }, 581 { UNITS, "" }, 582 { ORDINALS, "" } }; 583 vector<UnicodeString> rules_order; 584 vector<string> meta_rules; 585 string conffile = get_filename( settings_name ); 586 587 if ( !TiCC::isFile( conffile ) ){ 588 LOG << "Unable to open configfile: " << conffile << endl; 589 return false; 590 } 591 if ( !add_tokens.empty() && !TiCC::isFile( add_tokens ) ){ 592 LOG << "Unable to open additional tokens file: " << add_tokens << endl; 593 return false; 594 } 595 ifstream f( conffile ); 596 if ( f ){ 597 ConfigMode mode = NONE; 598 set_file = settings_name; 599 if ( tokDebug ){ 600 LOG << "config file=" << conffile << endl; 601 } 602 int rule_count = 0; 603 string rawline; 604 while ( getline( f, rawline ) ){ 605 if ( rawline.find( "%include" ) != string::npos ){ 606 string file = rawline.substr( 9 ); 607 switch ( mode ){ 608 case RULES: { 609 if ( !TiCC::match_back( file, ".rule" ) ){ 610 file += ".rule"; 611 } 612 file = get_filename( file ); 613 if ( !read_rules( file ) ){ 614 throw uConfigError( "'" + rawline + "' failed", set_file ); 615 } 616 } 617 break; 618 case FILTER:{ 619 if ( !TiCC::match_back( file, ".filter" ) ){ 620 file += ".filter"; 621 } 622 file = get_filename( file ); 623 if ( !read_filters( file ) ){ 624 throw uConfigError( "'" + rawline + "' failed", set_file ); 625 } 626 } 627 break; 628 case QUOTES:{ 629 if ( !TiCC::match_back( file, ".quote" ) ){ 630 file += ".quote"; 631 } 632 file = get_filename( file ); 633 if ( !read_quotes( file ) ){ 634 throw uConfigError( "'" + rawline + "' failed", set_file ); 635 } 636 } 637 break; 638 case EOSMARKERS:{ 639 if ( !TiCC::match_back( file, ".eos" ) ){ 640 file += ".eos"; 641 } 642 file = get_filename( file ); 643 if ( !read_eosmarkers( file ) ){ 644 throw uConfigError( "'" + rawline + "' failed", set_file ); 645 } 646 } 647 break; 648 case ABBREVIATIONS:{ 649 if ( !TiCC::match_back( file, ".abr" ) ){ 650 file += ".abr"; 651 } 652 file = get_filename( file ); 653 if ( !read_abbreviations( file, patterns[ABBREVIATIONS] ) ){ 654 throw uConfigError( "'" + rawline + "' failed", set_file ); 655 } 656 } 657 break; 658 default: 659 throw uConfigError( string("%include not implemented for this section"), 660 set_file ); 661 } 662 continue; 663 } 664 else if ( rawline.find( "%define" ) != string::npos ){ 665 string def = rawline.substr( 8 ); 666 vector<string> parts = TiCC::split_at_first_of( def, " \t", 2 ); 667 if ( parts.size() < 2 ){ 668 throw uConfigError( "invalid %define: " + rawline, set_file ); 669 } 670 UnicodeString macro = TiCC::UnicodeFromUTF8(splitter) 671 + TiCC::UnicodeFromUTF8(parts[0]) + TiCC::UnicodeFromUTF8(splitter); 672 macros[macro] = TiCC::UnicodeFromUTF8(parts[1]); 673 continue; 674 } 675 else if ( rawline.find( "SPLITTER=" ) != string::npos ){ 676 string local_splitter = rawline.substr( 9 ); 677 if ( local_splitter.empty() ) { 678 throw uConfigError( "invalid SPLITTER value in: " + rawline, 679 set_file ); 680 } 681 if ( local_splitter[0] == '"' 682 && local_splitter[local_splitter.length()-1] == '"' ){ 683 local_splitter = local_splitter.substr(1,local_splitter.length()-2); 684 } 685 if ( tokDebug > 5 ){ 686 LOG << "SET SPLITTER: '" << local_splitter << "'" << endl; 687 } 688 if ( local_splitter != splitter ){ 689 LOG << "updating splitter to: '" << local_splitter << "'" << endl; 690 } 691 splitter = local_splitter; 692 continue; 693 } 694 695 UnicodeString line = TiCC::UnicodeFromUTF8(rawline); 696 line.trim(); 697 if ((line.length() > 0) && (line[0] != '#')) { 698 if (line[0] == '[') { 699 mode = getMode( line ); 700 } 701 else { 702 if ( line[0] == '\\' && line.length() > 1 && line[1] == '[' ){ 703 line = UnicodeString( line, 1 ); 704 } 705 line = substitute_macros( line, macros ); 706 switch( mode ){ 707 case RULES: { 708 const int splitpoint = line.indexOf("="); 709 if ( splitpoint < 0 ){ 710 throw uConfigError( "invalid RULES entry: " + line, 711 set_file ); 712 } 713 UnicodeString id = UnicodeString( line, 0,splitpoint); 714 UnicodeString pat = UnicodeString( line, splitpoint+1); 715 rulesmap[id] = new Rule( id, pat ); 716 } 717 break; 718 case RULEORDER: 719 addOrder( rules_order, rules_index, 720 rule_count, line, set_file ); 721 break; 722 case METARULES: 723 meta_rules.push_back( TiCC::UnicodeToUTF8(line) ); 724 break; 725 case ABBREVIATIONS: 726 case ATTACHEDPREFIXES: 727 case ATTACHEDSUFFIXES: 728 case PREFIXES: 729 case SUFFIXES: 730 case TOKENS: 731 case CURRENCY: 732 case UNITS: 733 case ORDINALS: 734 if ( !patterns[mode].isEmpty() ) 735 patterns[mode] += '|'; 736 patterns[mode] += line; 737 break; 738 case EOSMARKERS: 739 if ( ( line.startsWith("\\u") && line.length() == 6 ) || 740 ( line.startsWith("\\U") && line.length() == 10 ) ){ 741 UnicodeString uit = line.unescape(); 742 if ( uit.isEmpty() ){ 743 throw uConfigError( "Invalid EOSMARKERS entry: " + line, 744 set_file ); 745 } 746 eosmarkers += uit; 747 } 748 break; 749 case QUOTES: { 750 int splitpoint = line.indexOf(" "); 751 if ( splitpoint == -1 ) 752 splitpoint = line.indexOf("\t"); 753 if ( splitpoint == -1 ){ 754 throw uConfigError( "invalid QUOTES entry: " + line 755 + " (missing whitespace)", 756 set_file ); 757 } 758 UnicodeString open = UnicodeString( line, 0,splitpoint); 759 UnicodeString close = UnicodeString( line, splitpoint+1); 760 open = open.trim().unescape(); 761 close = close.trim().unescape(); 762 if ( open.isEmpty() || close.isEmpty() ){ 763 throw uConfigError( "invalid QUOTES entry: " + line, 764 set_file ); 765 } 766 else { 767 quotes.add( open, close ); 768 } 769 } 770 break; 771 case FILTER: 772 filter.add( line ); 773 break; 774 case NONE: { 775 vector<string> parts = TiCC::split_at( rawline, "=" ); 776 if ( parts.size() == 2 ) { 777 if ( parts[0] == "version" ){ 778 version = parts[1]; 779 } 780 } 781 } 782 break; 783 default: 784 throw uLogicError( "unhandled case in switch" ); 785 } 786 } 787 } 788 } 789 790 // set reasonable defaults for those items that are NOT set 791 // in the configfile 792 if ( eosmarkers.length() == 0 ){ 793 eosmarkers = ".!?"; 794 } 795 if ( quotes.empty() ){ 796 quotes.add( '"', '"' ); 797 quotes.add( "‘", "’" ); 798 quotes.add( "“„‟", "”" ); 799 } 800 801 if ( !add_tokens.empty() ){ 802 ifstream adt( add_tokens ); 803 string line; 804 while ( getline( adt, line ) ){ 805 UnicodeString entry = TiCC::UnicodeFromUTF8(line); 806 entry = escape_regex( entry ); 807 if ( !entry.isEmpty() ){ 808 if ( !patterns[TOKENS].isEmpty() ){ 809 patterns[TOKENS] += '|'; 810 } 811 patterns[TOKENS] += entry; 812 } 813 } 814 } 815 // Create Rules for every pattern that is set 816 // first the meta rules... 817 for ( const auto& mr : meta_rules ){ 818 string::size_type pos = mr.find( "=" ); 819 if ( pos == string::npos ){ 820 throw uConfigError( "invalid entry in META-RULES: " + mr, 821 set_file ); 822 } 823 string nam = TiCC::trim( mr.substr( 0, pos ) ); 824 if ( nam == "SPLITTER" ){ 825 string local_splitter = mr.substr( pos+1 ); 826 if ( local_splitter.empty() ) { 827 throw uConfigError( "invalid SPLITTER value in META-RULES: " + mr, 828 set_file ); 829 } 830 if ( local_splitter[0] == '"' 831 && local_splitter[local_splitter.length()-1] == '"' ){ 832 local_splitter = local_splitter.substr(1,local_splitter.length()-2); 833 } 834 if ( tokDebug > 5 ){ 835 LOG << "SET SPLITTER: '" << local_splitter << "'" << endl; 836 } 837 if ( local_splitter != splitter ){ 838 LOG << "updating splitter to: '" << local_splitter << "'" << endl; 839 } 840 splitter = local_splitter; 841 continue; 842 } 843 UnicodeString name = TiCC::UnicodeFromUTF8( nam ); 844 string rule = mr.substr( pos+1 ); 845 if ( tokDebug > 5 ){ 846 LOG << "SPLIT using: '" << splitter << "'" << endl; 847 } 848 vector<string> parts = TiCC::split_at( rule, splitter ); 849 for ( auto& str : parts ){ 850 str = TiCC::trim( str ); 851 } 852 vector<UnicodeString> new_parts; 853 vector<UnicodeString> undef_parts; 854 bool skip_rule = false; 855 for ( const auto& part : parts ){ 856 UnicodeString meta = TiCC::UnicodeFromUTF8( part ); 857 ConfigMode local_mode = getMode( "[" + meta + "]" ); 858 switch ( local_mode ){ 859 case ORDINALS: 860 case ABBREVIATIONS: 861 case TOKENS: 862 case ATTACHEDPREFIXES: 863 case ATTACHEDSUFFIXES: 864 case UNITS: 865 case CURRENCY: 866 case PREFIXES: 867 case SUFFIXES: 868 if ( !patterns[local_mode].isEmpty()){ 869 UnicodeString val = substitute_macros( patterns[local_mode], 870 macros ); 871 new_parts.push_back( val ); 872 } 873 else { 874 undef_parts.push_back( meta ); 875 skip_rule = true; 876 } 877 break; 878 case NONE: 879 default: 880 new_parts.push_back( substitute_macros( TiCC::UnicodeFromUTF8(part), 881 macros ) ); 882 break; 883 } 884 } 885 if ( skip_rule ){ 886 using TiCC::operator<<; 887 LOG << set_file << ": skipping META rule: '" << name 888 << "', it mentions unknown pattern: '" 889 << undef_parts <<"'" << endl; 890 } 891 else { 892 add_rule( name, new_parts ); 893 } 894 } 895 sort_rules( rulesmap, rules_order ); 896 } 897 else { 898 return false; 899 } 900 int major = -1; 901 int minor = -1; 902 string sub; 903 if ( !version.empty() ){ 904 split( version, major, minor, sub ); 905 if ( tokDebug ){ 906 LOG << set_file << ": version=" << version << endl; 907 } 908 } 909 if ( major < 0 || minor < 2 ){ 910 if ( version.empty() ){ 911 LOG << "WARNING: your datafile for '" + set_file 912 << "' is missing a version number" << endl; 913 LOG << " Did you install uctodata version >=0.2 ?" << endl; 914 LOG << " or do you use your own setingsfile? Then please add a version number." << endl; 915 } 916 else { 917 LOG << "WARNING: your datafile '" + set_file 918 << "' has version: " << version << endl; 919 LOG << " for best results, you should a file with version >=0.2 " << endl; 920 } 921 } 922 if ( tokDebug ){ 923 LOG << "effective rules: " << endl; 924 for ( size_t i=0; i < rules.size(); ++i ){ 925 LOG << "rule " << i << " " << *rules[i] << endl; 926 } 927 LOG << "EOS markers: " << eosmarkers << endl; 928 LOG << "Quotations: " << quotes << endl; 929 try { 930 LOG << "Filter: " << filter << endl; 931 } 932 catch (...){ 933 } 934 } 935 return true; 936 } 937 938 }//namespace 939