1 /* 2 Copyright (c) 2006 - 2021 3 CLST - Radboud University 4 ILK - Tilburg University 5 6 This file is part of ticcutils 7 8 ticcutils is free software; you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation; either version 3 of the License, or 11 (at your option) any later version. 12 13 ticcutils is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program; if not, see <http://www.gnu.org/licenses/>. 20 21 For questions and suggestions, see: 22 https://github.com/LanguageMachines/ticcutils/issues 23 or send mail to: 24 lamasoftware (at ) science.ru.nl 25 26 */ 27 28 #include "ticcutils/StringOps.h" 29 30 #include <cerrno> 31 #include <cfloat> 32 #include <cstdlib> 33 #include <algorithm> 34 #include <string> 35 #include <iostream> 36 #include <sstream> 37 #include "ticcutils/Version.h" 38 39 using namespace std; 40 namespace TiCC { 41 BuildInfo()42 string BuildInfo() { 43 // cannot be defined in the header because otherwise __DATE__ and 44 // __TIME__ would be dynamic. (changing every time it is included) 45 return VersionName() + "-" + Version() + ". Compiled on " 46 + __DATE__ + " " + __TIME__; 47 } 48 trim(const string & s,const string & chars)49 string trim( const string& s, const string& chars ){ 50 /// remove leading and trailing characters from a string 51 /*! 52 \param s the string to trim 53 \param chars the characters to remove. The default is whitespace. 54 When \e chars = "", the following chars are used: " \t\r\n" 55 */ 56 string result; 57 if ( !s.empty() ){ 58 string::size_type b_pos = s.find_first_not_of( chars ); 59 if ( b_pos == string::npos ){ 60 return result; // 'empty' string. only garbage 61 } 62 string::size_type e_pos = s.find_last_not_of( chars ); 63 if ( e_pos == string::npos ){ 64 result = s.substr( b_pos ); 65 } 66 else { 67 result = s.substr( b_pos, e_pos-b_pos+1 ); 68 } 69 } 70 return result; 71 } 72 trim_front(const string & s,const string & chars)73 string trim_front( const string& s, const string& chars ){ 74 /// remove leading characters from a string 75 /*! 76 \param s the string to trim 77 \param chars the characters to remove. The default is whitespace. 78 When \e chars = "", the following chars are used: " \t\r\n" 79 */ 80 string result; 81 if ( !s.empty() ){ 82 string::size_type b_pos = s.find_first_not_of( chars ); 83 if ( b_pos != string::npos ){ 84 result = s.substr( b_pos ); 85 } 86 } 87 return result; 88 } 89 trim_back(const string & s,const string & chars)90 string trim_back( const string& s, const string& chars ){ 91 /// remove trailing characters from a string 92 /*! 93 \param s the string to trim 94 \param chars the characters to remove. The default is whitespace. 95 When \e chars = "", the following chars are used: " \t\r\n" 96 */ 97 string result; 98 if ( !s.empty() ){ 99 string::size_type e_pos = s.find_last_not_of( chars ); 100 if ( e_pos != string::npos ){ 101 result = s.substr( 0, e_pos+1 ); 102 } 103 } 104 return result; 105 } 106 toLower(const int & i)107 static int toLower( const int& i ){ return tolower(i); } toUpper(const int & i)108 static int toUpper( const int& i ){ return toupper(i); } 109 to_lower(string & s)110 void to_lower( string& s ){ 111 /// convert to all lowercase. Modifies the input. 112 transform( s.begin(), s.end(), s.begin(), toLower ); 113 } 114 to_upper(string & s)115 void to_upper( string& s ){ 116 /// convert to all uppercase. Modifies the input. 117 transform( s.begin(), s.end(), s.begin(), toUpper ); 118 } 119 lowercase(const string & s)120 string lowercase( const string& s ){ 121 /// return a lowercased copy of the inputstring 122 string result = s; 123 to_lower( result ); 124 return result; 125 } 126 uppercase(const string & s)127 string uppercase( const string& s ){ 128 /// return a uppercased copy of the inputstring 129 string result = s; 130 to_upper( result ); 131 return result; 132 } 133 match_back(const std::string & s,const std::string & tail)134 bool match_back( const std::string& s, const std::string& tail ){ 135 /// check if a string matches another at the back 136 /*! 137 \param s the string 138 \param tail the string we search in \e s 139 \return true is \e tail is the last pasrt of \e s 140 */ 141 int res = -2; 142 try { 143 res = s.compare( s.length() - tail.length(), tail.length(), tail ); 144 } 145 catch ( ... ){ 146 } 147 if ( res == 0 ){ 148 return true; 149 } 150 return false; 151 } 152 match_front(const std::string & s,const std::string & head)153 bool match_front( const std::string& s, const std::string& head ){ 154 /// check if a string matches another at the front 155 /*! 156 \param s the string 157 \param head the string we search in \e s 158 \return true is \e head is the first part of \e s 159 */ 160 int res = s.compare(0,head.length(),head); 161 if ( res == 0 ){ 162 return true; 163 } 164 return false; 165 } 166 local_split_at(const string & src,const string & sep,bool exact)167 static vector<string> local_split_at( const string& src, 168 const string& sep, 169 bool exact ){ 170 /// split a string into substrings. 171 /*! 172 \param src the string to split 173 \param sep a separator string. This may be a multi-character string. 174 \param exact normally, we silently skip empty entries (e.g. when two or 175 more separators co-incide), but not when exact=true. In that case result 176 may contain empty strings. 177 \return a vector of substrings 178 */ 179 if ( sep.empty() ){ 180 throw runtime_error( "TiCC::split_at(): separator is empty!" ); 181 } 182 vector<string> results; 183 string::size_type pos = 0; 184 while ( pos != string::npos ){ 185 string res; 186 string::size_type p = src.find( sep, pos ); 187 if ( p == string::npos ){ 188 res = src.substr( pos ); 189 pos = p; 190 } 191 else { 192 res = src.substr( pos, p - pos ); 193 pos = p + sep.length(); 194 } 195 if ( !res.empty() || exact ){ 196 results.push_back( res ); 197 } 198 } 199 return results; 200 } 201 local_split_at(const string & src,const string & sep,size_t max)202 static vector<string> local_split_at( const string& src, 203 const string& sep, 204 size_t max ){ 205 /// split a string into substrings. 206 /*! 207 \param src the string to split 208 \param sep a separator string. May be a multi-character string. 209 \param max if max > 0, limit the size of the result to \e max strings, 210 leaving the remainder in the last part of the result 211 \return a vector of split parts 212 */ 213 if ( sep.empty() ){ 214 throw runtime_error( "TiCC::split_at(): separator is empty!" ); 215 } 216 vector<string> results; 217 size_t cnt = 0; 218 string::size_type pos = 0; 219 while ( pos != string::npos ){ 220 string res; 221 string::size_type p = src.find( sep, pos ); 222 if ( p == string::npos ){ 223 res = src.substr( pos ); 224 pos = p; 225 } 226 else { 227 res = src.substr( pos, p - pos ); 228 pos = p + sep.length(); 229 } 230 if ( !res.empty() ){ 231 ++cnt; 232 results.push_back( res ); 233 } 234 if ( max != 0 && cnt >= max-1 ){ 235 if ( pos != string::npos ){ 236 results.push_back( src.substr( pos ) ); 237 } 238 break; 239 } 240 } 241 return results; 242 } 243 local_split_at_first_of(const string & src,const string & seps,bool exact)244 static vector<string> local_split_at_first_of( const string& src, 245 const string& seps, 246 bool exact ){ 247 /// split a string into substrings. 248 /*! 249 \param src the string to split 250 \param seps a string with separator characters. one of those should match 251 for a split to happen. 252 \param exact normally, we silently skip empty entries (e.g. when two or 253 more separators co-incide), but not when exact=true. In that case result 254 may contain empty strings. 255 \return a vector of split parts 256 */ 257 if ( seps.empty() ){ 258 throw runtime_error( "TiCC::split_at_first_of(): separators are empty!" ); 259 } 260 vector<string> results; 261 string::size_type s = 0; 262 while ( s != string::npos ){ 263 string res; 264 string::size_type e = src.find_first_of( seps, s ); 265 if ( e == string::npos ){ 266 res = src.substr( s ); 267 s = e; 268 } 269 else { 270 res = src.substr( s, e - s ); 271 s = e+1; 272 } 273 if ( !res.empty() || exact ){ 274 results.push_back( res ); 275 } 276 } 277 return results; 278 } 279 local_split_at_first_of(const string & src,const string & seps,size_t max)280 static vector<string> local_split_at_first_of( const string& src, 281 const string& seps, 282 size_t max ){ 283 /// split a string into substrings. 284 /*! 285 \param src the string to split 286 \param seps a string with separator characters. one of those should match 287 for a split to happen. 288 \param max if max > 0, limit the size of the result to \e max strings, 289 leaving the remainder in the last part of the result 290 \return a vector of split parts 291 */ 292 if ( seps.empty() ){ 293 throw runtime_error( "TiCC::split_at_first_of(): separators are empty!" ); 294 } 295 vector<string> results; 296 size_t cnt = 0; 297 string::size_type pos = 0; 298 while ( pos != string::npos ){ 299 string res; 300 string::size_type e = src.find_first_of( seps, pos ); 301 if ( e == string::npos ){ 302 res = src.substr( pos ); 303 pos = e; 304 } 305 else { 306 res = src.substr( pos, e - pos ); 307 pos = e+1; 308 } 309 if ( !res.empty() ){ 310 results.push_back( res ); 311 ++cnt; 312 } 313 if ( max != 0 && cnt >= max-1 ){ 314 if ( pos != string::npos ){ 315 results.push_back( src.substr( pos ) ); 316 } 317 break; 318 } 319 } 320 return results; 321 } 322 split_at(const string & src,const string & sep,size_t max)323 vector<string> split_at( const string& src, 324 const string& sep, 325 size_t max ){ 326 /// split a string into substrings. 327 /*! 328 \param src the string to split 329 \param sep a separator string. May be a multi-character string. 330 \param max if max > 0, limit the size of the result to \e max strings, 331 leaving the remainder in the last part of the result 332 \return a vector of split parts 333 */ 334 return local_split_at( src, sep, max ); 335 } 336 split_at(const std::string & s,std::vector<std::string> & v,const std::string & seps)337 size_t split_at( const std::string& s, 338 std::vector<std::string>& v, 339 const std::string& seps ){ 340 v = local_split_at( s, seps, false ); 341 return v.size(); 342 } 343 split(const std::string & s,size_t num)344 std::vector<std::string> split( const std::string& s, 345 size_t num ){ 346 return local_split_at_first_of( s, " \r\t\n", num ); 347 } 348 split_at_first_of(const string & s,const string & seps,size_t num)349 vector<string> split_at_first_of( const string& s, 350 const string& seps, 351 size_t num ){ 352 return local_split_at_first_of( s, seps, num ); 353 } 354 split_at_first_of(const std::string & s,std::vector<std::string> & v,const std::string & seps)355 size_t split_at_first_of( const std::string& s, 356 std::vector<std::string>& v, 357 const std::string& seps ){ 358 v = local_split_at_first_of( s, seps, false ); 359 return v.size(); 360 } 361 split(const std::string & s,std::vector<std::string> & v)362 size_t split( const std::string& s, 363 std::vector<std::string>& v ){ 364 v = local_split_at_first_of( s, " \r\t\n", false ); 365 return v.size(); 366 } 367 split_exact(const std::string & s,std::vector<std::string> & v)368 size_t split_exact( const std::string& s, 369 std::vector<std::string>& v ){ 370 v = local_split_at_first_of( s, " \r\t\n", true ); 371 return v.size(); 372 } 373 split_exact_at(const std::string & s,std::vector<std::string> & v,const std::string & m)374 size_t split_exact_at( const std::string& s, 375 std::vector<std::string>& v, 376 const std::string& m ){ 377 v = local_split_at( s, m, true ); 378 return v.size(); 379 } 380 split_exact_at_first_of(const std::string & s,std::vector<std::string> & v,const std::string & m)381 size_t split_exact_at_first_of( const std::string& s, 382 std::vector<std::string>& v, 383 const std::string& m ){ 384 v = local_split_at_first_of( s, m, true ); 385 return v.size(); 386 } 387 join(const vector<string> & vec,const string & sep)388 string join( const vector<string>& vec, const string& sep ){ 389 string result; 390 for ( const auto& s : vec ){ 391 result += s; 392 if ( &s != &vec.back() ){ 393 result += sep; 394 } 395 } 396 return result; 397 } 398 format_nonascii(const string & s)399 string format_nonascii( const string& s ){ 400 /// format weird strings (like UTF8, LATIN1) printable 401 // useful for debugging 402 stringstream os; 403 os << showbase << hex; 404 for ( const auto& c : s ){ 405 if ( isprint(c) && (int)c > 31 ){ 406 os << c; 407 } 408 else { 409 os << "-" << (short int)c << "-"; 410 } 411 } 412 os << noshowbase << dec; 413 return os.str(); 414 } 415 basename(const string & path)416 string basename( const string& path ){ 417 /// extract the basename of a path/filename 418 string::size_type pos = path.rfind( "/" ); 419 if ( pos != string::npos ){ 420 return path.substr(pos+1); 421 } 422 else { 423 return path; 424 } 425 } 426 dirname(const string & path)427 string dirname( const string& path ){ 428 /// extract the dirname of a path/filename 429 string::size_type pos = path.rfind( "/" ); 430 if ( pos != string::npos ){ 431 return path.substr(0,pos); 432 } 433 else { 434 return "."; 435 } 436 } 437 realpath(const string & path)438 string realpath( const string& path ){ 439 /// give the 'real' pathname for a relative path/filename 440 string result; 441 if ( path.empty() ){ 442 return result; 443 } 444 const char *in = path.c_str(); 445 char *out = 0; 446 char *res = ::realpath( in, out ); 447 if ( res ){ 448 result = string(res); 449 free( res); 450 } 451 return result; 452 } 453 454 } // namespace TiCC 455