1 /* 2 Copyright (c) 2006 - 2021 3 CLST - Radboud University 4 ILK - Tilburg University 5 6 This file is part of ticcutils 7 8 ticcutils is free software; you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation; either version 3 of the License, or 11 (at your option) any later version. 12 13 ticcutils is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program; if not, see <http://www.gnu.org/licenses/>. 20 21 For questions and suggestions, see: 22 https://github.com/LanguageMachines/ticcutils/issues 23 or send mail to: 24 lamasoftware (at ) science.ru.nl 25 26 */ 27 28 #include "ticcutils/Unicode.h" 29 #include <exception> 30 #include <stdexcept> 31 #include <iostream> 32 #include <fstream> 33 #include "unicode/normalizer2.h" 34 #include "unicode/ustream.h" 35 #include "ticcutils/StringOps.h" 36 37 using namespace std; 38 39 namespace TiCC { 40 using namespace icu; 41 UnicodeFromEnc(const string & s,const string & enc)42 UnicodeString UnicodeFromEnc( const string& s, const string& enc ){ 43 /// convert a character buffer in some encoding to a UnicodeString 44 /*! 45 \param s the string to interpret as a character buffer 46 \param enc the encoding to use 47 \return an UnicodeString object 48 */ 49 return UnicodeString( s.c_str(), s.length(), enc.c_str() ); 50 } 51 UnicodeToUTF8(const UnicodeString & s)52 string UnicodeToUTF8( const UnicodeString& s ){ 53 /// convert a UnicodeString to a UTF-8 string 54 /*! 55 \param s the UnicodeString to convert 56 \return an UTF-8 encoded string 57 */ 58 string result; 59 s.toUTF8String(result); 60 return result; 61 } 62 UnicodeNormalizer(const string & enc)63 UnicodeNormalizer::UnicodeNormalizer( const string& enc ): _normalizer(0) { 64 /// create an UnicodeNormalizer object 65 /*! 66 \param enc a string describing the wanted normalization. 67 valid values are: NFC (the default), NFD, NFKC, NFKD 68 */ 69 string mode = enc; 70 if ( mode.empty() ){ 71 mode = "NFC"; 72 } 73 setMode(mode); 74 } 75 ~UnicodeNormalizer()76 UnicodeNormalizer::~UnicodeNormalizer(){ 77 /// destroy the UnicodeNormalizer 78 // NEVER EVER delete _normalizer! 79 } 80 setMode(const string & enc)81 const string UnicodeNormalizer::setMode( const string& enc ){ 82 /// set the desired normalizer mode 83 /*! 84 \param enc the new mode to set 85 \return the previous mode 86 */ 87 if ( enc == mode 88 || (enc.empty() && mode == "NFC") ){ 89 return mode; 90 } 91 else { 92 // NEVER EVER delete _normalizer! it is static 93 UErrorCode err = U_ZERO_ERROR; 94 if ( enc == "" 95 || enc == "NFC" ){ 96 _normalizer = Normalizer2::getNFCInstance( err ); 97 } 98 else if ( enc == "NONE" ){ 99 _normalizer = 0; 100 } 101 else if ( enc == "NFD" ){ 102 _normalizer = Normalizer2::getNFDInstance( err ); 103 } 104 else if ( enc == "NFKC" ){ 105 _normalizer = Normalizer2::getNFKCInstance( err ); 106 } 107 else if ( enc == "NFKD" ){ 108 _normalizer = Normalizer2::getNFKDInstance( err ); 109 } 110 else { 111 throw logic_error( "invalid normalization mode: " + enc ); 112 } 113 string tmp = mode; 114 mode = enc; 115 if ( mode.empty() ){ 116 mode = "NFC"; 117 } 118 return tmp; 119 } 120 } 121 normalize(const UnicodeString & us)122 UnicodeString UnicodeNormalizer::normalize( const UnicodeString& us ){ 123 /// normalize a UnicodeString to the current mode 124 /*! 125 \param us the UnicodeString to normalize 126 \return the UnicodeString in the correct normalization 127 */ 128 if ( _normalizer == 0 ){ 129 return us; 130 } 131 else { 132 UErrorCode status=U_ZERO_ERROR; 133 UnicodeString r = _normalizer->normalize( us, status ); 134 if (U_FAILURE(status)){ 135 throw invalid_argument("Normalizer"); 136 } 137 return r; 138 } 139 } 140 141 /// @cond HIDDEN 142 class uRegexError: public invalid_argument { 143 public: uRegexError(const string & s)144 explicit uRegexError( const string& s ): invalid_argument( "Invalid regular expression: " + s ){}; uRegexError(const UnicodeString & us)145 explicit uRegexError( const UnicodeString& us ): invalid_argument( "Invalid regular expression: " + UnicodeToUTF8(us) ){}; 146 }; 147 /// @endcond 148 Pattern() const149 UnicodeString UnicodeRegexMatcher::Pattern() const { 150 /// return the current Regex pattern 151 return pattern->pattern(); 152 } 153 UnicodeRegexMatcher(const UnicodeString & pat,const UnicodeString & name)154 UnicodeRegexMatcher::UnicodeRegexMatcher( const UnicodeString& pat, 155 const UnicodeString& name ): 156 _name(name), _debug(false) 157 { 158 /// create a RegexMatcher object 159 /*! 160 \param pat The pattern to use 161 \param name a name we give to this RegexMatcher (for error messages) 162 */ 163 matcher = NULL; 164 UErrorCode u_stat = U_ZERO_ERROR; 165 UParseError errorInfo; 166 pattern = RegexPattern::compile( pat, 0, errorInfo, u_stat ); 167 if ( U_FAILURE(u_stat) ){ 168 string spat = UnicodeToUTF8(pat); 169 string failString = UnicodeToUTF8(_name); 170 if ( errorInfo.offset >0 ){ 171 failString += " at position " + TiCC::toString( errorInfo.offset ) + "\n"; 172 UnicodeString pat1 = UnicodeString( pat, 0, errorInfo.offset -1 ); 173 failString += UnicodeToUTF8(pat1) + " <== HERE\n"; 174 } 175 else { 176 failString += "'" + spat + "' "; 177 } 178 throw uRegexError(failString); 179 } 180 else { 181 matcher = pattern->matcher( u_stat ); 182 if (U_FAILURE(u_stat)){ 183 string failString = "'" + UnicodeToUTF8(pat) + "'"; 184 throw uRegexError(failString); 185 } 186 } 187 } 188 ~UnicodeRegexMatcher()189 UnicodeRegexMatcher::~UnicodeRegexMatcher(){ 190 /// destroy a RegexMatcher 191 delete pattern; 192 delete matcher; 193 } 194 match_all(const UnicodeString & line,UnicodeString & pre,UnicodeString & post)195 bool UnicodeRegexMatcher::match_all( const UnicodeString& line, 196 UnicodeString& pre, 197 UnicodeString& post ){ 198 /// apply the RegexMatcher on an Unicode line 199 /*! 200 \param line the UnicodeString to analyze 201 \param pre the part of the line BEFORE the match, may be "" 202 \param post the part of the line AFTER the match, may be "" 203 \return true when there was some match found 204 205 if match_all returns true, you need to call get_match() to get results 206 */ 207 UErrorCode u_stat = U_ZERO_ERROR; 208 pre = ""; 209 post = ""; 210 results.clear(); 211 if ( matcher ){ 212 if ( _debug ){ 213 cerr << "start matcher [" << line << "], pattern = " << Pattern() << endl; 214 } 215 matcher->reset( line ); 216 if ( matcher->find() ){ 217 if ( _debug ){ 218 cerr << "matched " << line << endl; 219 for ( int i=0; i <= matcher->groupCount(); ++i ){ 220 cerr << "group[" << i << "] =" << matcher->group(i,u_stat) << endl; 221 } 222 } 223 if ( matcher->groupCount() == 0 ){ 224 // case 1: a rule without capture groups matches 225 UnicodeString us = matcher->group(0,u_stat) ; 226 if ( _debug ){ 227 cerr << "case 1, result = " << us << endl; 228 } 229 results.push_back( us ); 230 int start = matcher->start( 0, u_stat ); 231 if ( start > 0 ){ 232 pre = UnicodeString( line, 0, start ); 233 if ( _debug ){ 234 cerr << "found pre " << pre << endl; 235 } 236 } 237 int end = matcher->end( 0, u_stat ); 238 if ( end < line.length() ){ 239 post = UnicodeString( line, end ); 240 if ( _debug ){ 241 cerr << "found post " << post << endl; 242 } 243 } 244 return true; 245 } 246 else if ( matcher->groupCount() == 1 ){ 247 // case 2: a rule with one capture group matches 248 int start = matcher->start( 1, u_stat ); 249 if ( start >= 0 ){ 250 UnicodeString us = matcher->group(1,u_stat) ; 251 if ( _debug ){ 252 cerr << "case 2a , result = " << us << endl; 253 } 254 results.push_back( us ); 255 if ( start > 0 ){ 256 pre = UnicodeString( line, 0, start ); 257 if ( _debug ){ 258 cerr << "found pre " << pre << endl; 259 } 260 } 261 int end = matcher->end( 1, u_stat ); 262 if ( end < line.length() ){ 263 post = UnicodeString( line, end ); 264 if ( _debug ){ 265 cerr << "found post " << post << endl; 266 } 267 } 268 } 269 else { 270 // group 1 is empty, return group 0 271 UnicodeString us = matcher->group(0,u_stat) ; 272 if ( _debug ){ 273 cerr << "case 2b , result = " << us << endl; 274 } 275 results.push_back( us ); 276 start = matcher->start( 0, u_stat ); 277 if ( start > 0 ){ 278 pre = UnicodeString( line, 0, start ); 279 if ( _debug ){ 280 cerr << "found pre " << pre << endl; 281 } 282 } 283 int end = matcher->end( 0, u_stat ); 284 if ( end < line.length() ){ 285 post = UnicodeString( line, end ); 286 if ( _debug ){ 287 cerr << "found post " << post << endl; 288 } 289 } 290 } 291 return true; 292 } 293 else { 294 // a rule with more then 1 capture group 295 // this is quite ugly... 296 int end = 0; 297 for ( int i=0; i <= matcher->groupCount(); ++i ){ 298 if ( _debug ){ 299 cerr << "group " << i << endl; 300 } 301 u_stat = U_ZERO_ERROR; 302 int start = matcher->start( i, u_stat ); 303 if ( _debug ){ 304 cerr << "start = " << start << endl; 305 } 306 if ( !U_FAILURE(u_stat) ){ 307 if ( start < 0 ){ 308 continue; 309 } 310 } 311 else 312 break; 313 if ( start > end ){ 314 pre = UnicodeString( line, end, start ); 315 if ( _debug ){ 316 cerr << "found pre " << pre << endl; 317 } 318 } 319 end = matcher->end( i, u_stat ); 320 if ( _debug ){ 321 cerr << "end = " << end << endl; 322 } 323 if ( !U_FAILURE(u_stat) ){ 324 results.push_back( UnicodeString( line, start, end - start ) ); 325 if ( _debug ){ 326 cerr << "added result " << results.back() << endl; 327 } 328 } 329 else 330 break; 331 } 332 if ( end < line.length() ){ 333 post = UnicodeString( line, end ); 334 if ( _debug ){ 335 cerr << "found post " << post << endl; 336 } 337 } 338 return true; 339 } 340 } 341 } 342 results.clear(); 343 return false; 344 } 345 get_match(unsigned int n) const346 const UnicodeString UnicodeRegexMatcher::get_match( unsigned int n ) const{ 347 /// get one match from the RegexMatcher 348 /*! 349 \param n the index of the match 350 \return the match result as a UnicodeString. Returns "" when n is out 351 of range. 352 */ 353 if ( n < results.size() ){ 354 return results[n]; 355 } 356 return ""; 357 } 358 NumOfMatches() const359 int UnicodeRegexMatcher::NumOfMatches() const { 360 /// give the number of matches found. 361 if ( results.size() > 0 ){ 362 return results.size()-1; 363 } 364 return 0; 365 } 366 split(const UnicodeString & us,vector<UnicodeString> & result)367 int UnicodeRegexMatcher::split( const UnicodeString& us, 368 vector<UnicodeString>& result ){ 369 /// split a UnicodeString using the stored pattern 370 /*! 371 \param us the UnicodeString to split 372 \param result a vector with the splitted parts 373 \return the number os elements in the result 374 */ 375 result.clear(); 376 const int maxWords = 256; 377 UnicodeString words[maxWords]; 378 UErrorCode status = U_ZERO_ERROR; 379 int numWords = matcher->split( us, words, maxWords, status ); 380 for ( int i = 0; i < numWords; ++i ){ 381 result.push_back( words[i] ); 382 } 383 return numWords; 384 } 385 UniFilter()386 UniFilter::UniFilter(): _trans(0) { 387 /// create a Unicode Filter object 388 } ~UniFilter()389 UniFilter::~UniFilter(){ 390 /// destroy a Unicode Filter object 391 delete _trans; 392 } 393 get_rules() const394 UnicodeString UniFilter::get_rules() const { 395 /// extract the current rules from the Unicode Filter 396 UnicodeString result; 397 if ( !_trans ){ 398 throw runtime_error( "UniFilter::getRules(), filter not initialized." ); 399 } 400 else { 401 return _trans->toRules( result, true ); 402 } 403 } 404 init(const UnicodeString & rules,const UnicodeString & name)405 bool UniFilter::init( const UnicodeString& rules, 406 const UnicodeString& name ){ 407 /// initialize a Unicode Filter 408 /*! 409 \param rules a Unicode string with filter rules 410 \param name a name for the filter (used for error messages) 411 \return true on succes, will throw on error 412 */ 413 if ( _trans ){ 414 throw logic_error( "UniFilter::init():, filter already initialized." ); 415 } 416 UErrorCode stat = U_ZERO_ERROR; 417 UParseError err; 418 _trans = Transliterator::createFromRules( name, 419 rules, 420 UTRANS_FORWARD, 421 err, 422 stat ); 423 if ( U_FAILURE( stat ) ){ 424 string msg = "creating UniFilter: " + UnicodeToUTF8( name ) 425 + " failed\n" + "error in rules, line=" + toString(err.line) 426 + " at position: " + toString(err.offset); 427 throw runtime_error( msg ); 428 } 429 return true; 430 } 431 to_icu_rule(const UnicodeString & line)432 UnicodeString to_icu_rule( const UnicodeString& line ){ 433 /// convert an ICU Transcriptor rule or a trivial replacement into 434 /// an ICU rule 435 /*! 436 \param line the rule to convert 437 \return an Unicode representation of the rule 438 439 A rule can be an ICU Transcriptor rule like " ß > sz ;" 440 OR a simple mentioning of a symbol to be replaced " ss sz" (old_style) 441 442 The old_style variants are converted to a ICU rule. (always only just 1) 443 444 otherwise the input is just returned 'as is' 445 */ 446 bool old_style = line.indexOf( '>' ) == -1; 447 if ( old_style ){ 448 UnicodeString result; 449 bool inserted = false; 450 for ( int i=0; i < line.length(); ++i ){ 451 if ( line[i] == '`' || line[i] == '\'' || line[i] == '"' ){ 452 result += '\\'; 453 } 454 else if ( (line[i] == ' ' || line[i] == '\t' ) 455 && !inserted ){ 456 // OLD style: replace first space by a '>' symbol. 457 inserted = true; 458 result += " >"; 459 } 460 result += line[i]; 461 } 462 if ( !inserted ){ 463 // special case. line was only something like "\u00A0" or "-" 464 // which means: delete (replace by nothing) 465 result += " >"; 466 } 467 result += " ;"; 468 return result; 469 } 470 else { 471 return line; 472 } 473 } 474 fill(const string & filename,const string & label)475 bool UniFilter::fill( const string& filename, 476 const string& label ){ 477 /// fill a Unicode Filter from a file 478 /*! 479 \param filename the file to read 480 \param label a label for the filter 481 \return true on succes, will throw on erroe 482 */ 483 ifstream is( filename ); 484 if ( !is ){ 485 throw runtime_error( "UniFilter::fill(), unable te open rules file: '" 486 + filename + "'" ); 487 } 488 UnicodeString rule; 489 string line; 490 while ( getline( is, line ) ){ 491 UnicodeString uline = UnicodeFromUTF8( line ); 492 rule += to_icu_rule( uline ); 493 } 494 return init( rule, UnicodeFromUTF8(label) ); 495 } 496 filter(const UnicodeString & line)497 UnicodeString UniFilter::filter( const UnicodeString& line ){ 498 /// apply the Unicode Filter on a Unicode line 499 /*! 500 \param line the inputline 501 \return the resulting filtered line 502 */ 503 if ( !_trans ){ 504 // throw logic_error( "UniFilter not initialized." ); 505 return line; 506 } 507 else { 508 UnicodeString result = line; 509 _trans->transliterate( result ); 510 return result; 511 } 512 } 513 add(const UnicodeString & in)514 bool UniFilter::add( const UnicodeString& in ){ 515 /// add an extra rule to the Unicode Filter 516 /*! 517 \param in a rule to add 518 */ 519 // 520 // TODO: cache multiple add's and only (re-)init the transliterator 521 // once. On first use of the filter() method. 522 // caveat: Warnings about problems will be postponed too 523 // 524 UnicodeString uline = to_icu_rule( in ); 525 UnicodeString old_rules; 526 UnicodeString id = "generatedId"; 527 if ( _trans ){ 528 _trans->toRules( old_rules, false ); 529 id = _trans->getID(); 530 delete _trans; 531 _trans = 0; 532 } 533 // cerr << "OLD rule: " << old_rules << endl; 534 // cerr << "add rule: " << uline << endl; 535 old_rules += uline; 536 // cerr << "NEW rule: " << old_rules << endl; 537 // cerr << "ID = " << id << endl; 538 return init( old_rules, id ); 539 } 540 add(const string & line)541 bool UniFilter::add( const string& line ){ 542 /// add an extra rule to the Unicode Filter 543 /*! 544 \param line a UTF-8 encoded rule 545 */ 546 UnicodeString uline = UnicodeFromUTF8( line ); 547 return add( uline ); 548 } 549 operator <<(ostream & os,const UniFilter & uf)550 ostream& operator<<( ostream& os, const UniFilter& uf ){ 551 /// output the current Rules to a stream 552 os << uf.get_rules(); 553 return os; 554 } 555 filter_diacritics(const UnicodeString & in)556 UnicodeString filter_diacritics( const UnicodeString& in ) { 557 /// filter ALL diacritics from an UnicodeString 558 /*! 559 \param in the UnicodeString to filter from 560 \return an UnicodeString with all diacrytics removed 561 */ 562 static Transliterator *trans = 0; 563 if ( trans == 0 ){ 564 UErrorCode stat = U_ZERO_ERROR; 565 trans = Transliterator::createInstance( "NFD; [:M:] Remove; NFC", 566 UTRANS_FORWARD, 567 stat ); 568 if ( U_FAILURE( stat ) ){ 569 throw runtime_error( "filter_diacritics() transliterator not created" ); 570 } 571 } 572 UnicodeString result = in; 573 trans->transliterate( result ); 574 return result; 575 } 576 split_at(const UnicodeString & src,const UnicodeString & sep,size_t max)577 vector<UnicodeString> split_at( const UnicodeString& src, 578 const UnicodeString& sep, 579 size_t max ){ 580 /// split an UnicodeString 581 /*! 582 \param src the UnicodeString to split 583 \param sep the separator to split at 584 \param max limit the size off the result to max, when max > 0 585 leaving the remainder in the last part of the result 586 \return a vector with the splitted parts 587 588 \note this function skips empty entries (e.g. when two or more separators 589 co-incide) 590 */ 591 if ( sep.isEmpty() ){ 592 throw runtime_error( "TiCC::split_at(): separator is empty!" ); 593 } 594 vector<UnicodeString> results; 595 size_t cnt = 0; 596 int pos = 0; 597 while ( pos != -1 ){ 598 UnicodeString res; 599 int p = src.indexOf( sep, pos ); 600 if ( p == -1 ){ 601 res = src.tempSubString( pos ); 602 pos = p; 603 } 604 else { 605 res = src.tempSubString( pos, p - pos ); 606 pos = p + sep.length(); 607 } 608 if ( !res.isEmpty() ){ 609 ++cnt; 610 results.push_back( res ); 611 } 612 if ( max != 0 && cnt >= max-1 ){ 613 if ( pos != -1 ){ 614 results.push_back( src.tempSubString( pos ) ); 615 } 616 break; 617 } 618 } 619 return results; 620 } 621 split_exact_at(const UnicodeString & src,const UnicodeString & sep)622 vector<UnicodeString> split_exact_at( const UnicodeString& src, 623 const UnicodeString& sep ){ 624 /// split an UnicodeString 625 /*! 626 \param src the UnicodeString to split 627 \param sep the separator string to split at 628 \return a vector with the splitted parts 629 630 \note this function creates empty entries (e.g. when two or more 631 separators co-incide) 632 */ 633 if ( sep.isEmpty() ){ 634 throw runtime_error( "TiCC::split_at(): separator is empty!" ); 635 } 636 vector<UnicodeString> results; 637 int pos = 0; 638 while ( pos != -1 ){ 639 UnicodeString res; 640 int p = src.indexOf( sep, pos ); 641 if ( p == -1 ){ 642 res = src.tempSubString( pos ); 643 pos = p; 644 } 645 else { 646 res = src.tempSubString( pos, p - pos ); 647 pos = p + sep.length(); 648 } 649 results.push_back( res ); 650 } 651 return results; 652 } 653 find_first_of(const UnicodeString & src,const UnicodeString & seps,int pos)654 int find_first_of( const UnicodeString& src, 655 const UnicodeString& seps, 656 int pos ){ 657 /// find the first occurrence of one of the seps in a string 658 /*! 659 \param src the string to search 660 \param seps a list of separator characters 661 \param pos start position for the search 662 \return the position found, or -1 when not present 663 */ 664 int result = src.length()+10; 665 for ( int i=0; i < seps.length(); ++i ){ 666 int p = src.indexOf( seps[i], pos ); 667 if ( p >= 0 ){ 668 result = min( p, result ); 669 } 670 } 671 if ( result >= 0 && result < src.length() ){ 672 return result; 673 } 674 return -1; 675 } 676 split_at_first_of(const UnicodeString & src,const UnicodeString & seps,size_t max)677 vector<UnicodeString> split_at_first_of( const UnicodeString& src, 678 const UnicodeString& seps, 679 size_t max ){ 680 /// split an UnicodeString 681 /*! 682 \param src the UnicodeString to split 683 \param seps a list of separator characters 684 \param max limit the size off the result to max, when max > 0 685 leaving the remainder in the last part of the result 686 \return a vector with the splitted parts 687 688 \note this function skips empty entries (e.g. when two or more separators 689 co-incide) 690 */ 691 if ( seps.isEmpty() ){ 692 throw runtime_error( "TiCC::split_at_first_of(): separators are empty!" ); 693 } 694 vector<UnicodeString> results; 695 size_t cnt = 0; 696 int pos = 0; 697 while ( pos != -1 ){ 698 UnicodeString res; 699 int e = find_first_of( src, seps, pos ); 700 if ( e == -1 ){ 701 res = src.tempSubString( pos ); 702 pos = e; 703 } 704 else { 705 res = src.tempSubString( pos, e - pos ); 706 pos = e+1; 707 } 708 if ( !res.isEmpty() ){ 709 results.push_back( res ); 710 ++cnt; 711 } 712 if ( max != 0 && cnt >= max-1 ){ 713 if ( pos != -1 ){ 714 results.push_back( src.tempSubString( pos ) ); 715 } 716 break; 717 } 718 } 719 return results; 720 } 721 split(const UnicodeString & src,size_t max)722 vector<UnicodeString> split( const UnicodeString& src, 723 size_t max ){ 724 /// split an UnicodeString at whitespace 725 /*! 726 \param src the UnicodeString to split 727 \param max limit the size off the result to max, when max > 0 728 leaving the remainder in the last part of the result 729 \return a vector with the splitted parts 730 731 \note this function skips empty entries (e.g. when two or more separators 732 co-incide) 733 */ 734 static UnicodeString spaces = TiCC::UnicodeFromUTF8( " \r\t\n" ); 735 return split_at_first_of( src, spaces, max ); 736 } 737 split_exact_at_first_of(const UnicodeString & src,const UnicodeString & seps)738 vector<UnicodeString> split_exact_at_first_of( const UnicodeString& src, 739 const UnicodeString& seps ){ 740 /// split an UnicodeString 741 /*! 742 \param src the UnicodeString to split 743 \param seps a list of separator characters 744 \return a vector with the splitted parts 745 746 \note this function may create empty entries (e.g. when two or more 747 separators co-incide) 748 */ 749 if ( seps.isEmpty() ){ 750 throw runtime_error( "TiCC::split_at_first_of(): separators are empty!" ); 751 } 752 vector<UnicodeString> results; 753 int pos = 0; 754 while ( pos != -1 ){ 755 UnicodeString res; 756 int e = find_first_of( src, seps, pos ); 757 if ( e == -1 ){ 758 res = src.tempSubString( pos ); 759 pos = e; 760 } 761 else { 762 res = src.tempSubString( pos, e - pos ); 763 pos = e+1; 764 } 765 results.push_back( res ); // evan when empty 766 } 767 return results; 768 } 769 join(const vector<UnicodeString> & vec,const UnicodeString & sep)770 UnicodeString join( const vector<UnicodeString>& vec, 771 const UnicodeString& sep ){ 772 UnicodeString result; 773 for ( const auto& s : vec ){ 774 result += s; 775 if ( &s != &vec.back() ){ 776 result += sep; 777 } 778 } 779 return result; 780 } 781 utf8_lowercase(const string & in)782 string utf8_lowercase( const string& in ){ 783 /// convert an UTF-8 string to lowercase 784 UnicodeString us = TiCC::UnicodeFromUTF8( in ); 785 us.toLower(); 786 return TiCC::UnicodeToUTF8( us ); 787 } 788 utf8_uppercase(const string & in)789 string utf8_uppercase( const string& in ){ 790 /// convert an UTF-8 string to uppercase 791 UnicodeString us = TiCC::UnicodeFromUTF8( in ); 792 us.toUpper(); 793 return TiCC::UnicodeToUTF8( us ); 794 } 795 utrim(const UnicodeString & us,const UnicodeString & filter)796 UnicodeString utrim( const UnicodeString& us, 797 const UnicodeString& filter ){ 798 UnicodeString result; 799 int start_p = 0; 800 for ( int i=0; i < us.length(); ++i ){ 801 int pos = filter.indexOf( us[i] ); 802 if ( pos < 0 ){ 803 // not found 804 start_p = i; 805 break; 806 } 807 } 808 int end_p = us.length()-1; 809 for ( int i = end_p; i > start_p; --i ){ 810 int pos = filter.indexOf( us[i] ); 811 if ( pos < 0 ){ 812 // not found 813 end_p = i; 814 break; 815 } 816 } 817 return UnicodeString( us, start_p, end_p - start_p + 1 ); 818 } 819 ltrim(const UnicodeString & us,const UnicodeString & filter)820 UnicodeString ltrim( const UnicodeString& us, 821 const UnicodeString& filter ){ 822 UnicodeString result; 823 int start_p = 0; 824 for ( int i=0; i < us.length(); ++i ){ 825 int pos = filter.indexOf( us[i] ); 826 if ( pos < 0 ){ 827 // not found 828 start_p = i; 829 break; 830 } 831 } 832 return UnicodeString( us, start_p, us.length() - start_p + 1 ); 833 } 834 rtrim(const UnicodeString & us,const UnicodeString & filter)835 UnicodeString rtrim( const UnicodeString& us, 836 const UnicodeString& filter ){ 837 UnicodeString result; 838 int start_p = 0; 839 int end_p = us.length()-1; 840 for ( int i = end_p; i > start_p; --i ){ 841 int pos = filter.indexOf( us[i] ); 842 if ( pos < 0 ){ 843 // not found, done 844 end_p = i; 845 break; 846 } 847 } 848 return UnicodeString( us, start_p, end_p - start_p + 1 ); 849 } 850 getline(istream & is,UnicodeString & us,const char delim)851 istream& getline( istream& is, 852 UnicodeString& us, 853 const char delim ){ 854 /// read a UnicodeString from an encoded file 855 /*! 856 \param is The stream to read from 857 \param us the UnicodeString to read. (will be cleared before reading) 858 \param delim The delimiter. Default '\n' 859 \return the stream 860 */ 861 return getline( is, us, "UTF8", delim ); 862 } 863 getline(istream & is,UnicodeString & us,const string & encoding,const char delim)864 istream& getline( istream& is, 865 UnicodeString& us, 866 const string& encoding, 867 const char delim ){ 868 /// read a UnicodeString from an encoded file 869 /*! 870 \param is The stream to read from 871 \param us the UnicodeString to read. (will be cleared before reading) 872 \param encoding The Unicode encoding of the input stream. It is up to the 873 caller to assure this encoding is valid. 874 \param delim The delimiter. Default '\n' 875 \return the stream 876 */ 877 string line; 878 std::getline( is, line, delim ); 879 us = TiCC::UnicodeFromEnc( line, encoding ); 880 return is; 881 } 882 883 } 884