1 /*
2   Copyright (c) 2006 - 2021
3   CLST  - Radboud University
4   ILK   - Tilburg University
5 
6   This file is part of ticcutils
7 
8   ticcutils is free software; you can redistribute it and/or modify
9   it under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 3 of the License, or
11   (at your option) any later version.
12 
13   ticcutils is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   GNU General Public License for more details.
17 
18   You should have received a copy of the GNU General Public License
19   along with this program; if not, see <http://www.gnu.org/licenses/>.
20 
21   For questions and suggestions, see:
22       https://github.com/LanguageMachines/ticcutils/issues
23   or send mail to:
24       lamasoftware (at ) science.ru.nl
25 
26 */
27 
28 #include "ticcutils/Unicode.h"
29 #include <exception>
30 #include <stdexcept>
31 #include <iostream>
32 #include <fstream>
33 #include "unicode/normalizer2.h"
34 #include "unicode/ustream.h"
35 #include "ticcutils/StringOps.h"
36 
37 using namespace std;
38 
39 namespace TiCC {
40   using namespace icu;
41 
UnicodeFromEnc(const string & s,const string & enc)42   UnicodeString UnicodeFromEnc( const string& s, const string& enc ){
43     /// convert a character buffer in some encoding to a UnicodeString
44     /*!
45       \param s the string to interpret as a character buffer
46       \param enc the encoding to use
47       \return an UnicodeString object
48     */
49     return UnicodeString( s.c_str(), s.length(), enc.c_str() );
50   }
51 
UnicodeToUTF8(const UnicodeString & s)52   string UnicodeToUTF8( const UnicodeString& s ){
53     /// convert a UnicodeString to a UTF-8 string
54     /*!
55       \param s the UnicodeString to convert
56       \return an UTF-8 encoded string
57     */
58     string result;
59     s.toUTF8String(result);
60     return result;
61   }
62 
UnicodeNormalizer(const string & enc)63   UnicodeNormalizer::UnicodeNormalizer( const string& enc ): _normalizer(0) {
64     /// create an UnicodeNormalizer object
65     /*!
66       \param enc a string describing the wanted normalization.
67       valid values are: NFC (the default), NFD, NFKC, NFKD
68     */
69     string mode = enc;
70     if ( mode.empty() ){
71       mode = "NFC";
72     }
73     setMode(mode);
74   }
75 
~UnicodeNormalizer()76   UnicodeNormalizer::~UnicodeNormalizer(){
77     /// destroy the UnicodeNormalizer
78     // NEVER EVER delete _normalizer!
79   }
80 
const string UnicodeNormalizer::setMode( const string& enc ){
  /// set the desired normalizer mode
  /*!
    \param enc the new mode to set: NFC, NFD, NFKC, NFKD or NONE.
    An empty string selects NFC.
    \return the previous mode

    Throws logic_error for an unknown mode, and runtime_error when ICU
    reports a failure while providing the normalizer. In both cases the
    current mode and normalizer are left untouched.
  */
  if ( enc == mode
       || ( enc.empty() && mode == "NFC" ) ){
    // nothing to change
    return mode;
  }
  const string new_mode = ( enc.empty() ? string( "NFC" ) : enc );
  // NEVER EVER delete _normalizer! it is static
  UErrorCode err = U_ZERO_ERROR;
  // Look the instance up into a local first, so a failure cannot leave the
  // object half-updated.
  const Normalizer2 *wanted = nullptr;
  if ( new_mode == "NFC" ){
    wanted = Normalizer2::getNFCInstance( err );
  }
  else if ( new_mode == "NONE" ){
    // no normalizer at all: normalize() returns its input unchanged
    wanted = nullptr;
  }
  else if ( new_mode == "NFD" ){
    wanted = Normalizer2::getNFDInstance( err );
  }
  else if ( new_mode == "NFKC" ){
    wanted = Normalizer2::getNFKCInstance( err );
  }
  else if ( new_mode == "NFKD" ){
    wanted = Normalizer2::getNFKDInstance( err );
  }
  else {
    throw logic_error( "invalid normalization mode: " + enc );
  }
  if ( U_FAILURE( err ) ){
    // previously ignored: the mode was recorded while no normalizer was set
    throw runtime_error( "unable to get a normalizer for mode: " + new_mode );
  }
  const string previous = mode;
  _normalizer = wanted;
  mode = new_mode;
  return previous;
}
121 
UnicodeString UnicodeNormalizer::normalize( const UnicodeString& us ){
  /// bring a UnicodeString into the currently selected normalization form
  /*!
    \param us the UnicodeString to normalize
    \return the normalized UnicodeString, or \a us itself when no
    normalizer is set

    Throws invalid_argument when the normalizer reports a failure.
  */
  if ( !_normalizer ){
    return us;
  }
  UErrorCode status = U_ZERO_ERROR;
  UnicodeString normalized = _normalizer->normalize( us, status );
  if ( U_FAILURE( status ) ){
    throw invalid_argument( "Normalizer" );
  }
  return normalized;
}
140 
141   /// @cond HIDDEN
142   class uRegexError: public invalid_argument {
143   public:
uRegexError(const string & s)144     explicit uRegexError( const string& s ): invalid_argument( "Invalid regular expression: " + s ){};
uRegexError(const UnicodeString & us)145     explicit uRegexError( const UnicodeString& us ): invalid_argument( "Invalid regular expression: " + UnicodeToUTF8(us) ){};
146   };
147   /// @endcond
148 
Pattern() const149   UnicodeString UnicodeRegexMatcher::Pattern() const {
150     /// return the current Regex pattern
151     return pattern->pattern();
152   }
153 
UnicodeRegexMatcher(const UnicodeString & pat,const UnicodeString & name)154   UnicodeRegexMatcher::UnicodeRegexMatcher( const UnicodeString& pat,
155 					    const UnicodeString& name ):
156     _name(name), _debug(false)
157   {
158     /// create a RegexMatcher object
159     /*!
160       \param pat The pattern to use
161       \param name a name we give to this RegexMatcher (for error messages)
162     */
163     matcher = NULL;
164     UErrorCode u_stat = U_ZERO_ERROR;
165     UParseError errorInfo;
166     pattern = RegexPattern::compile( pat, 0, errorInfo, u_stat );
167     if ( U_FAILURE(u_stat) ){
168       string spat = UnicodeToUTF8(pat);
169       string failString = UnicodeToUTF8(_name);
170       if ( errorInfo.offset >0 ){
171 	failString += " at position " + TiCC::toString( errorInfo.offset ) + "\n";
172 	UnicodeString pat1 = UnicodeString( pat, 0, errorInfo.offset -1 );
173 	failString += UnicodeToUTF8(pat1) + " <== HERE\n";
174       }
175       else {
176 	failString += "'" + spat + "' ";
177       }
178       throw uRegexError(failString);
179     }
180     else {
181       matcher = pattern->matcher( u_stat );
182       if (U_FAILURE(u_stat)){
183 	string failString = "'" + UnicodeToUTF8(pat) + "'";
184 	throw uRegexError(failString);
185       }
186     }
187   }
188 
~UnicodeRegexMatcher()189   UnicodeRegexMatcher::~UnicodeRegexMatcher(){
190     /// destroy a RegexMatcher
191     delete pattern;
192     delete matcher;
193   }
194 
match_all(const UnicodeString & line,UnicodeString & pre,UnicodeString & post)195   bool UnicodeRegexMatcher::match_all( const UnicodeString& line,
196 				       UnicodeString& pre,
197 				       UnicodeString& post ){
198     /// apply the RegexMatcher on an Unicode line
199     /*!
200       \param line the UnicodeString to analyze
201       \param pre the part of the line BEFORE the match, may be ""
202       \param post the part of the line AFTER the match, may be ""
203       \return true when there was some match found
204 
205       if match_all returns true, you need to call get_match() to get results
206     */
207     UErrorCode u_stat = U_ZERO_ERROR;
208     pre = "";
209     post = "";
210     results.clear();
211     if ( matcher ){
212       if ( _debug ){
213 	cerr << "start matcher [" << line << "], pattern = " << Pattern() << endl;
214       }
215       matcher->reset( line );
216       if ( matcher->find() ){
217 	if ( _debug ){
218 	  cerr << "matched " << line << endl;
219 	  for ( int i=0; i <= matcher->groupCount(); ++i ){
220 	    cerr << "group[" << i << "] =" << matcher->group(i,u_stat) << endl;
221 	  }
222 	}
223 	if ( matcher->groupCount() == 0 ){
224 	  // case 1: a rule without capture groups matches
225 	  UnicodeString us = matcher->group(0,u_stat) ;
226 	  if ( _debug ){
227 	    cerr << "case 1, result = " << us << endl;
228 	  }
229 	  results.push_back( us );
230 	  int start = matcher->start( 0, u_stat );
231 	  if ( start > 0 ){
232 	    pre = UnicodeString( line, 0, start );
233 	    if ( _debug ){
234 	      cerr << "found pre " << pre << endl;
235 	    }
236 	  }
237 	  int end = matcher->end( 0, u_stat );
238 	  if ( end < line.length() ){
239 	    post = UnicodeString( line, end );
240 	    if ( _debug ){
241 	      cerr << "found post " << post << endl;
242 	    }
243 	  }
244 	  return true;
245 	}
246 	else if ( matcher->groupCount() == 1 ){
247 	  // case 2: a rule with one capture group matches
248 	  int start = matcher->start( 1, u_stat );
249 	  if ( start >= 0 ){
250 	    UnicodeString us = matcher->group(1,u_stat) ;
251 	    if ( _debug ){
252 	      cerr << "case 2a , result = " << us << endl;
253 	    }
254 	    results.push_back( us );
255 	    if ( start > 0 ){
256 	      pre = UnicodeString( line, 0, start );
257 	      if ( _debug ){
258 		cerr << "found pre " << pre << endl;
259 	      }
260 	    }
261 	    int end = matcher->end( 1, u_stat );
262 	    if ( end < line.length() ){
263 	      post = UnicodeString( line, end );
264 	      if ( _debug ){
265 		cerr << "found post " << post << endl;
266 	      }
267 	    }
268 	  }
269 	  else {
270 	    // group 1 is empty, return group 0
271 	    UnicodeString us = matcher->group(0,u_stat) ;
272 	    if ( _debug ){
273 	      cerr << "case 2b , result = " << us << endl;
274 	    }
275 	    results.push_back( us );
276 	    start = matcher->start( 0, u_stat );
277 	    if ( start > 0 ){
278 	      pre = UnicodeString( line, 0, start );
279 	      if ( _debug ){
280 		cerr << "found pre " << pre << endl;
281 	      }
282 	    }
283 	    int end = matcher->end( 0, u_stat );
284 	    if ( end < line.length() ){
285 	      post = UnicodeString( line, end );
286 	      if ( _debug ){
287 		cerr << "found post " << post << endl;
288 	      }
289 	    }
290 	  }
291 	  return true;
292 	}
293 	else {
294 	  // a rule with more then 1 capture group
295 	  // this is quite ugly...
296 	  int end = 0;
297 	  for ( int i=0; i <= matcher->groupCount(); ++i ){
298 	    if ( _debug ){
299 	      cerr << "group " << i << endl;
300 	    }
301 	    u_stat = U_ZERO_ERROR;
302 	    int start = matcher->start( i, u_stat );
303 	    if ( _debug ){
304 	      cerr << "start = " << start << endl;
305 	    }
306 	    if ( !U_FAILURE(u_stat) ){
307 	      if ( start < 0 ){
308 		continue;
309 	      }
310 	    }
311 	    else
312 	      break;
313 	    if ( start > end ){
314 	      pre = UnicodeString( line, end, start );
315 	      if ( _debug ){
316 		cerr << "found pre " << pre << endl;
317 	      }
318 	    }
319 	    end = matcher->end( i, u_stat );
320 	    if ( _debug ){
321 	      cerr << "end = " << end << endl;
322 	    }
323 	    if ( !U_FAILURE(u_stat) ){
324 	      results.push_back( UnicodeString( line, start, end - start ) );
325 	      if ( _debug ){
326 		cerr << "added result " << results.back() << endl;
327 	      }
328 	    }
329 	    else
330 	      break;
331 	  }
332 	  if ( end < line.length() ){
333 	    post = UnicodeString( line, end );
334 	    if ( _debug ){
335 	      cerr << "found post " << post << endl;
336 	    }
337 	  }
338 	  return true;
339 	}
340       }
341     }
342     results.clear();
343     return false;
344   }
345 
get_match(unsigned int n) const346   const UnicodeString UnicodeRegexMatcher::get_match( unsigned int n ) const{
347     /// get one match from the RegexMatcher
348     /*!
349       \param n the index of the match
350       \return the match result as a UnicodeString. Returns "" when n is out
351       of range.
352     */
353     if ( n < results.size() ){
354       return results[n];
355     }
356     return "";
357   }
358 
NumOfMatches() const359   int UnicodeRegexMatcher::NumOfMatches() const {
360     /// give the number of matches found.
361     if ( results.size() > 0 ){
362       return results.size()-1;
363     }
364     return 0;
365   }
366 
split(const UnicodeString & us,vector<UnicodeString> & result)367   int UnicodeRegexMatcher::split( const UnicodeString& us,
368 				  vector<UnicodeString>& result ){
369     /// split a UnicodeString using the stored pattern
370     /*!
371       \param us the UnicodeString to split
372       \param result a vector with the splitted parts
373       \return the number os elements in the result
374     */
375     result.clear();
376     const int maxWords = 256;
377     UnicodeString words[maxWords];
378     UErrorCode status = U_ZERO_ERROR;
379     int numWords = matcher->split( us, words, maxWords, status );
380     for ( int i = 0; i < numWords; ++i ){
381       result.push_back( words[i] );
382     }
383     return numWords;
384   }
385 
UniFilter()386   UniFilter::UniFilter(): _trans(0) {
387     /// create a Unicode Filter object
388   }
~UniFilter()389   UniFilter::~UniFilter(){
390     /// destroy a Unicode Filter object
391     delete _trans;
392   }
393 
get_rules() const394   UnicodeString UniFilter::get_rules() const {
395     /// extract the current rules from the Unicode Filter
396     UnicodeString result;
397     if ( !_trans ){
398       throw runtime_error( "UniFilter::getRules(), filter not initialized." );
399     }
400     else {
401       return _trans->toRules( result, true );
402     }
403   }
404 
init(const UnicodeString & rules,const UnicodeString & name)405   bool UniFilter::init( const UnicodeString& rules,
406 			const UnicodeString& name ){
407     /// initialize a Unicode Filter
408     /*!
409       \param rules a Unicode string with filter rules
410       \param name a name for the filter (used for error messages)
411       \return true on succes, will throw on error
412     */
413     if ( _trans ){
414       throw logic_error( "UniFilter::init():, filter already initialized." );
415     }
416     UErrorCode stat = U_ZERO_ERROR;
417     UParseError err;
418     _trans = Transliterator::createFromRules( name,
419 						   rules,
420 						   UTRANS_FORWARD,
421 						   err,
422 						   stat );
423     if ( U_FAILURE( stat ) ){
424       string msg = "creating UniFilter: " + UnicodeToUTF8( name )
425 	+ " failed\n" + "error in rules, line=" + toString(err.line)
426 	+ " at position: " + toString(err.offset);
427       throw runtime_error( msg );
428     }
429     return true;
430   }
431 
UnicodeString to_icu_rule( const UnicodeString& line ){
  /// turn a filter line into an ICU rule
  /*!
    \param line the rule to convert
    \return an Unicode representation of the rule

    A line containing a '>' is taken to be an ICU rule already, like
    " ß > sz ;", and is returned 'as is'.

    Every other line is an old_style replacement like " ss sz". It is
    converted to exactly one ICU rule: quote characters get a backslash in
    front, " >" is inserted before the first space or tab, and " ;" is
    appended.
  */
  if ( line.indexOf( '>' ) != -1 ){
    return line;
  }
  UnicodeString rule;
  bool arrow_placed = false;
  for ( int i=0; i < line.length(); ++i ){
    const auto ch = line[i];
    const bool is_quote = ( ch == '`' || ch == '\'' || ch == '"' );
    const bool is_blank = ( ch == ' ' || ch == '\t' );
    if ( is_quote ){
      rule += '\\';
    }
    else if ( is_blank && !arrow_placed ){
      // OLD style: the first blank marks the place for the '>' symbol.
      arrow_placed = true;
      rule += " >";
    }
    rule += ch;
  }
  if ( !arrow_placed ){
    // special case. line was only something like "\u00A0" or "-"
    // which means: delete (replace by nothing)
    rule += " >";
  }
  rule += " ;";
  return rule;
}
474 
fill(const string & filename,const string & label)475   bool UniFilter::fill( const string& filename,
476 			const string& label ){
477     /// fill a Unicode Filter from a file
478     /*!
479       \param filename the file to read
480       \param label a label for the filter
481       \return true on succes, will throw on erroe
482     */
483     ifstream is( filename );
484     if ( !is ){
485       throw runtime_error( "UniFilter::fill(), unable te open rules file: '"
486 			   + filename + "'" );
487     }
488     UnicodeString rule;
489     string line;
490     while ( getline( is, line ) ){
491       UnicodeString uline = UnicodeFromUTF8( line );
492       rule += to_icu_rule( uline );
493     }
494     return init( rule, UnicodeFromUTF8(label) );
495   }
496 
UnicodeString UniFilter::filter( const UnicodeString& line ){
  /// run a Unicode line through the Unicode Filter
  /*!
    \param line the inputline
    \return the filtered line. A filter that was never initialized gives
    back the input unchanged.
  */
  if ( !_trans ){
    // deliberately no exception here: no rules means nothing to do
    return line;
  }
  UnicodeString filtered( line );
  _trans->transliterate( filtered );
  return filtered;
}
513 
add(const UnicodeString & in)514   bool UniFilter::add( const UnicodeString& in ){
515     /// add an extra rule to the Unicode Filter
516     /*!
517       \param in a rule to add
518     */
519     //
520     // TODO: cache multiple add's and only (re-)init the transliterator
521     //       once. On first use of the filter() method.
522     //       caveat: Warnings about problems will be postponed too
523     //
524     UnicodeString uline = to_icu_rule( in );
525     UnicodeString old_rules;
526     UnicodeString id = "generatedId";
527     if ( _trans ){
528       _trans->toRules( old_rules, false );
529       id = _trans->getID();
530       delete _trans;
531       _trans = 0;
532     }
533     // cerr << "OLD rule: " << old_rules << endl;
534     // cerr << "add rule: " << uline << endl;
535     old_rules += uline;
536     // cerr << "NEW rule: " << old_rules << endl;
537     // cerr << "ID = " << id << endl;
538     return init( old_rules, id );
539   }
540 
add(const string & line)541   bool UniFilter::add( const string& line ){
542     /// add an extra rule to the Unicode Filter
543     /*!
544       \param line a UTF-8 encoded rule
545     */
546     UnicodeString uline = UnicodeFromUTF8( line );
547     return add( uline );
548   }
549 
operator <<(ostream & os,const UniFilter & uf)550   ostream& operator<<( ostream& os, const UniFilter& uf ){
551     /// output the current Rules to a stream
552     os << uf.get_rules();
553     return os;
554   }
555 
UnicodeString filter_diacritics( const UnicodeString& in ) {
  /// remove ALL diacritics from an UnicodeString
  /*!
    \param in the UnicodeString to filter
    \return an UnicodeString with all diacritics removed

    The transliterator ("NFD; [:M:] Remove; NFC") is created on the first
    call and kept for later calls. Throws runtime_error when it cannot be
    created.
  */
  static Transliterator *stripper = 0;
  if ( !stripper ){
    UErrorCode status = U_ZERO_ERROR;
    stripper = Transliterator::createInstance( "NFD; [:M:] Remove; NFC",
                                               UTRANS_FORWARD,
                                               status );
    if ( U_FAILURE( status ) ){
      throw runtime_error( "filter_diacritics()  transliterator not created" );
    }
  }
  UnicodeString stripped( in );
  stripper->transliterate( stripped );
  return stripped;
}
576 
split_at(const UnicodeString & src,const UnicodeString & sep,size_t max)577   vector<UnicodeString> split_at( const UnicodeString& src,
578 				  const UnicodeString& sep,
579 				  size_t max ){
580     /// split an UnicodeString
581     /*!
582       \param src the UnicodeString to split
583       \param sep the separator to split at
584       \param max limit the size off the result to max, when max > 0
585       leaving the remainder in the last part of the result
586       \return a vector with the splitted parts
587 
588       \note this function skips empty entries (e.g. when two or more separators
589       co-incide)
590     */
591     if ( sep.isEmpty() ){
592       throw runtime_error( "TiCC::split_at(): separator is empty!" );
593     }
594     vector<UnicodeString> results;
595     size_t cnt = 0;
596     int pos = 0;
597     while ( pos != -1 ){
598       UnicodeString res;
599       int p = src.indexOf( sep, pos );
600       if ( p == -1 ){
601 	res = src.tempSubString( pos );
602 	pos = p;
603       }
604       else {
605 	res = src.tempSubString( pos, p - pos );
606 	pos = p + sep.length();
607       }
608       if ( !res.isEmpty() ){
609 	++cnt;
610 	results.push_back( res );
611       }
612       if ( max != 0 && cnt >= max-1 ){
613 	if ( pos != -1 ){
614 	  results.push_back( src.tempSubString( pos ) );
615 	}
616 	break;
617       }
618     }
619     return results;
620   }
621 
split_exact_at(const UnicodeString & src,const UnicodeString & sep)622   vector<UnicodeString> split_exact_at( const UnicodeString& src,
623 					const UnicodeString& sep ){
624     /// split an UnicodeString
625     /*!
626       \param src the UnicodeString to split
627       \param sep the separator string to split at
628       \return a vector with the splitted parts
629 
630       \note this function creates empty entries (e.g. when two or more
631       separators co-incide)
632     */
633     if ( sep.isEmpty() ){
634       throw runtime_error( "TiCC::split_at(): separator is empty!" );
635     }
636     vector<UnicodeString> results;
637     int pos = 0;
638     while ( pos != -1 ){
639       UnicodeString res;
640       int p = src.indexOf( sep, pos );
641       if ( p == -1 ){
642 	res = src.tempSubString( pos );
643 	pos = p;
644       }
645       else {
646 	res = src.tempSubString( pos, p - pos );
647 	pos = p + sep.length();
648       }
649       results.push_back( res );
650     }
651     return results;
652   }
653 
find_first_of(const UnicodeString & src,const UnicodeString & seps,int pos)654   int find_first_of( const UnicodeString& src,
655 		     const UnicodeString& seps,
656 		     int pos ){
657     /// find the first occurrence of one of the seps in a string
658     /*!
659       \param src the string to search
660       \param seps a list of separator characters
661       \param pos start position for the search
662       \return the position found, or -1 when not present
663     */
664     int result = src.length()+10;
665     for ( int i=0; i < seps.length(); ++i ){
666       int p = src.indexOf( seps[i], pos );
667       if ( p >= 0 ){
668 	result = min( p, result );
669       }
670     }
671     if ( result >= 0 && result < src.length() ){
672       return result;
673     }
674     return -1;
675   }
676 
split_at_first_of(const UnicodeString & src,const UnicodeString & seps,size_t max)677   vector<UnicodeString> split_at_first_of( const UnicodeString& src,
678 					   const UnicodeString& seps,
679 					   size_t max ){
680     /// split an UnicodeString
681     /*!
682       \param src the UnicodeString to split
683       \param seps a list of separator characters
684       \param max limit the size off the result to max, when max > 0
685       leaving the remainder in the last part of the result
686       \return a vector with the splitted parts
687 
688       \note this function skips empty entries (e.g. when two or more separators
689       co-incide)
690     */
691     if ( seps.isEmpty() ){
692       throw runtime_error( "TiCC::split_at_first_of(): separators are empty!" );
693     }
694     vector<UnicodeString> results;
695     size_t cnt = 0;
696     int pos = 0;
697     while ( pos != -1 ){
698       UnicodeString res;
699       int e = find_first_of( src, seps, pos );
700       if ( e == -1 ){
701 	res = src.tempSubString( pos );
702 	pos = e;
703       }
704       else {
705 	res = src.tempSubString( pos, e - pos );
706 	pos = e+1;
707       }
708       if ( !res.isEmpty() ){
709 	results.push_back( res );
710 	++cnt;
711       }
712       if ( max != 0 && cnt >= max-1 ){
713 	if ( pos != -1 ){
714 	  results.push_back( src.tempSubString( pos ) );
715 	}
716 	break;
717       }
718     }
719     return results;
720   }
721 
split(const UnicodeString & src,size_t max)722   vector<UnicodeString> split( const UnicodeString& src,
723 			       size_t max ){
724     /// split an UnicodeString at whitespace
725     /*!
726       \param src the UnicodeString to split
727       \param max limit the size off the result to max, when max > 0
728       leaving the remainder in the last part of the result
729       \return a vector with the splitted parts
730 
731       \note this function skips empty entries (e.g. when two or more separators
732       co-incide)
733     */
734     static UnicodeString spaces = TiCC::UnicodeFromUTF8( " \r\t\n" );
735     return split_at_first_of( src, spaces, max );
736   }
737 
split_exact_at_first_of(const UnicodeString & src,const UnicodeString & seps)738   vector<UnicodeString> split_exact_at_first_of( const UnicodeString& src,
739 						 const UnicodeString& seps ){
740     /// split an UnicodeString
741     /*!
742       \param src the UnicodeString to split
743       \param seps a list of separator characters
744       \return a vector with the splitted parts
745 
746       \note this function may create empty entries (e.g. when two or more
747       separators co-incide)
748     */
749     if ( seps.isEmpty() ){
750       throw runtime_error( "TiCC::split_at_first_of(): separators are empty!" );
751     }
752     vector<UnicodeString> results;
753     int pos = 0;
754     while ( pos != -1 ){
755       UnicodeString res;
756       int e = find_first_of( src, seps, pos );
757       if ( e == -1 ){
758 	res = src.tempSubString( pos );
759 	pos = e;
760       }
761       else {
762 	res = src.tempSubString( pos, e - pos );
763 	pos = e+1;
764       }
765       results.push_back( res ); // evan when empty
766     }
767     return results;
768   }
769 
join(const vector<UnicodeString> & vec,const UnicodeString & sep)770   UnicodeString join( const vector<UnicodeString>& vec,
771 		     const UnicodeString& sep ){
772     UnicodeString result;
773     for ( const auto& s : vec ){
774       result += s;
775       if ( &s != &vec.back() ){
776 	result += sep;
777       }
778     }
779     return result;
780   }
781 
string utf8_lowercase( const string& in ){
  /// give the lowercase version of an UTF-8 string
  // done in Unicode: convert, lowercase, convert back
  UnicodeString text = TiCC::UnicodeFromUTF8( in );
  return TiCC::UnicodeToUTF8( text.toLower() );
}
788 
string utf8_uppercase( const string& in ){
  /// give the uppercase version of an UTF-8 string
  // done in Unicode: convert, uppercase, convert back
  UnicodeString text = TiCC::UnicodeFromUTF8( in );
  return TiCC::UnicodeToUTF8( text.toUpper() );
}
795 
UnicodeString utrim( const UnicodeString& us,
                     const UnicodeString& filter ){
  /// remove leading and trailing characters from an UnicodeString
  /*!
    \param us the UnicodeString to trim
    \param filter the characters to remove
    \return \a us without leading and trailing characters that are in
    \a filter. The result is empty when \a us holds such characters only.
  */
  const int len = us.length();
  // first is the index of the first character to keep
  int first = 0;
  while ( first < len && filter.indexOf( us[first] ) >= 0 ){
    ++first;
  }
  // last is one past the last character to keep. It may come down to
  // first itself, which gives an empty result.
  int last = len;
  while ( last > first && filter.indexOf( us[last-1] ) >= 0 ){
    --last;
  }
  return UnicodeString( us, first, last - first );
}
819 
UnicodeString ltrim( const UnicodeString& us,
                     const UnicodeString& filter ){
  /// remove leading characters from an UnicodeString
  /*!
    \param us the UnicodeString to trim
    \param filter the characters to remove
    \return \a us without leading characters that are in \a filter. The
    result is empty when \a us holds such characters only.
  */
  const int len = us.length();
  // first is the index of the first character to keep, or len when every
  // character is in the filter
  int first = 0;
  while ( first < len && filter.indexOf( us[first] ) >= 0 ){
    ++first;
  }
  return UnicodeString( us, first, len - first );
}
834 
UnicodeString rtrim( const UnicodeString& us,
                     const UnicodeString& filter ){
  /// remove trailing characters from an UnicodeString
  /*!
    \param us the UnicodeString to trim
    \param filter the characters to remove
    \return \a us without trailing characters that are in \a filter. The
    result is empty when \a us holds such characters only.
  */
  // keep is the number of characters to retain. The scan includes index 0,
  // so a string like "a  " is trimmed too.
  int keep = us.length();
  while ( keep > 0 && filter.indexOf( us[keep-1] ) >= 0 ){
    --keep;
  }
  return UnicodeString( us, 0, keep );
}
850 
getline(istream & is,UnicodeString & us,const char delim)851   istream& getline( istream& is,
852 		    UnicodeString& us,
853 		    const char delim ){
854     /// read a UnicodeString from an encoded file
855     /*!
856       \param is The stream to read from
857       \param us the UnicodeString to read. (will be cleared before reading)
858       \param delim The delimiter. Default '\n'
859       \return the stream
860     */
861     return getline( is, us, "UTF8", delim );
862   }
863 
getline(istream & is,UnicodeString & us,const string & encoding,const char delim)864   istream& getline( istream& is,
865 		    UnicodeString& us,
866 		    const string& encoding,
867 		    const char delim ){
868     /// read a UnicodeString from an encoded file
869     /*!
870       \param is The stream to read from
871       \param us the UnicodeString to read. (will be cleared before reading)
872       \param encoding The Unicode encoding of the input stream. It is up to the
873       caller to assure this encoding is valid.
874       \param delim The delimiter. Default '\n'
875       \return the stream
876     */
877     string line;
878     std::getline( is, line, delim );
879     us = TiCC::UnicodeFromEnc( line, encoding );
880     return is;
881   }
882 
883 }
884