1 /*
2   Copyright (c) 2006 - 2021
3   CLST  - Radboud University
4   ILK   - Tilburg University
5 
6   This file is part of ticcutils
7 
8   ticcutils is free software; you can redistribute it and/or modify
9   it under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 3 of the License, or
11   (at your option) any later version.
12 
13   ticcutils is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   GNU General Public License for more details.
17 
18   You should have received a copy of the GNU General Public License
19   along with this program; if not, see <http://www.gnu.org/licenses/>.
20 
21   For questions and suggestions, see:
22       https://github.com/LanguageMachines/ticcutils/issues
23   or send mail to:
24       lamasoftware (at ) science.ru.nl
25 
26 */
27 
28 #include "ticcutils/StringOps.h"
29 
30 #include <cerrno>
31 #include <cfloat>
32 #include <cstdlib>
33 #include <algorithm>
34 #include <string>
35 #include <iostream>
36 #include <sstream>
37 #include "ticcutils/Version.h"
38 
39 using namespace std;
40 namespace TiCC {
41 
BuildInfo()42   string BuildInfo() {
43     // cannot be defined in the header because otherwise __DATE__ and
44     // __TIME__ would be dynamic. (changing every time it is included)
45     return VersionName() + "-" + Version() + ". Compiled on "
46       + __DATE__ + " " + __TIME__;
47   }
48 
trim(const string & s,const string & chars)49   string trim( const string& s, const string& chars ){
50     /// remove leading and trailing characters from a string
51     /*!
52       \param s the string to trim
53       \param chars the characters to remove. The default is whitespace.
54       When \e chars = "", the following chars are used: " \t\r\n"
55     */
56     string result;
57     if ( !s.empty() ){
58       string::size_type b_pos = s.find_first_not_of( chars );
59       if ( b_pos == string::npos ){
60 	return result; // 'empty' string. only garbage
61       }
62       string::size_type e_pos = s.find_last_not_of( chars );
63       if ( e_pos == string::npos ){
64 	result = s.substr( b_pos );
65       }
66       else {
67 	result = s.substr( b_pos, e_pos-b_pos+1 );
68       }
69     }
70     return result;
71   }
72 
trim_front(const string & s,const string & chars)73   string trim_front( const string& s, const string& chars ){
74     /// remove leading characters from a string
75     /*!
76       \param s the string to trim
77       \param chars the characters to remove. The default is whitespace.
78       When \e chars = "", the following chars are used: " \t\r\n"
79     */
80     string result;
81     if ( !s.empty() ){
82       string::size_type b_pos = s.find_first_not_of( chars );
83       if ( b_pos != string::npos ){
84 	result = s.substr( b_pos );
85       }
86     }
87     return result;
88   }
89 
trim_back(const string & s,const string & chars)90   string trim_back( const string& s, const string& chars ){
91     /// remove trailing characters from a string
92     /*!
93       \param s the string to trim
94       \param chars the characters to remove. The default is whitespace.
95       When \e chars = "", the following chars are used: " \t\r\n"
96     */
97     string result;
98     if ( !s.empty() ){
99       string::size_type e_pos = s.find_last_not_of( chars );
100       if ( e_pos != string::npos ){
101 	result = s.substr( 0, e_pos+1 );
102       }
103     }
104     return result;
105   }
106 
toLower(const int & i)107   static int toLower( const int& i ){ return tolower(i); }
toUpper(const int & i)108   static int toUpper( const int& i ){ return toupper(i); }
109 
to_lower(string & s)110   void to_lower( string& s ){
111     /// convert to all lowercase. Modifies the input.
112     transform( s.begin(), s.end(), s.begin(), toLower );
113   }
114 
to_upper(string & s)115   void to_upper( string& s ){
116     /// convert to all uppercase. Modifies the input.
117     transform( s.begin(), s.end(), s.begin(), toUpper );
118   }
119 
lowercase(const string & s)120   string lowercase( const string& s ){
121     /// return a lowercased copy of the inputstring
122     string result = s;
123     to_lower( result );
124     return result;
125   }
126 
uppercase(const string & s)127   string uppercase( const string& s ){
128     /// return a uppercased copy of the inputstring
129     string result = s;
130     to_upper( result );
131     return result;
132   }
133 
match_back(const std::string & s,const std::string & tail)134   bool match_back( const std::string& s, const std::string& tail ){
135     /// check if a string matches another at the back
136     /*!
137       \param s the string
138       \param tail the string we search in \e s
139       \return true is \e tail is the last pasrt of \e s
140     */
141     int res = -2;
142     try {
143       res = s.compare( s.length() - tail.length(), tail.length(), tail );
144     }
145     catch ( ... ){
146     }
147     if ( res == 0  ){
148       return true;
149     }
150     return false;
151   }
152 
match_front(const std::string & s,const std::string & head)153   bool match_front( const std::string& s, const std::string& head ){
154     /// check if a string matches another at the front
155     /*!
156       \param s the string
157       \param head the string we search in \e s
158       \return true is \e head is the first part of \e s
159     */
160     int res = s.compare(0,head.length(),head);
161     if ( res == 0  ){
162       return true;
163     }
164     return false;
165   }
166 
local_split_at(const string & src,const string & sep,bool exact)167   static vector<string> local_split_at( const string& src,
168 					const string& sep,
169 					bool exact ){
170     /// split a string into substrings.
171     /*!
172       \param src the string to split
173       \param sep a separator string. This may be a multi-character string.
174       \param exact normally, we silently skip empty entries (e.g. when two or
175       more separators co-incide), but not when exact=true. In that case result
176       may contain empty strings.
177       \return a vector of substrings
178     */
179     if ( sep.empty() ){
180       throw runtime_error( "TiCC::split_at(): separator is empty!" );
181     }
182     vector<string> results;
183     string::size_type pos = 0;
184     while ( pos != string::npos ){
185       string res;
186       string::size_type p = src.find( sep, pos );
187       if ( p == string::npos ){
188 	res = src.substr( pos );
189 	pos = p;
190       }
191       else {
192 	res = src.substr( pos, p - pos );
193 	pos = p + sep.length();
194       }
195       if ( !res.empty() || exact ){
196 	results.push_back( res );
197       }
198     }
199     return results;
200   }
201 
local_split_at(const string & src,const string & sep,size_t max)202   static vector<string> local_split_at( const string& src,
203 					const string& sep,
204 					size_t max ){
205     /// split a string into substrings.
206     /*!
207       \param src the string to split
208       \param sep a separator string. May be a multi-character string.
209       \param max if max > 0, limit the size of the result to \e max strings,
210       leaving the remainder in the last part of the result
211       \return a vector of split parts
212     */
213     if ( sep.empty() ){
214       throw runtime_error( "TiCC::split_at(): separator is empty!" );
215     }
216     vector<string> results;
217     size_t cnt = 0;
218     string::size_type pos = 0;
219     while ( pos != string::npos ){
220       string res;
221       string::size_type p = src.find( sep, pos );
222       if ( p == string::npos ){
223 	res = src.substr( pos );
224 	pos = p;
225       }
226       else {
227 	res = src.substr( pos, p - pos );
228 	pos = p + sep.length();
229       }
230       if ( !res.empty() ){
231 	++cnt;
232 	results.push_back( res );
233       }
234       if ( max != 0 && cnt >= max-1 ){
235 	if ( pos != string::npos ){
236 	  results.push_back( src.substr( pos ) );
237 	}
238 	break;
239       }
240     }
241     return results;
242   }
243 
local_split_at_first_of(const string & src,const string & seps,bool exact)244   static vector<string> local_split_at_first_of( const string& src,
245 						 const string& seps,
246 						 bool exact ){
247     /// split a string into substrings.
248     /*!
249       \param src the string to split
250       \param seps a string with separator characters. one of those should match
251       for a split to happen.
252       \param exact normally, we silently skip empty entries (e.g. when two or
253       more separators co-incide), but not when exact=true. In that case result
254       may contain empty strings.
255       \return a vector of split parts
256     */
257     if ( seps.empty() ){
258       throw runtime_error( "TiCC::split_at_first_of(): separators are empty!" );
259     }
260     vector<string> results;
261     string::size_type s = 0;
262     while ( s != string::npos ){
263       string res;
264       string::size_type e = src.find_first_of( seps, s );
265       if ( e == string::npos ){
266 	res = src.substr( s );
267 	s = e;
268       }
269       else {
270 	res = src.substr( s, e - s );
271 	s = e+1;
272       }
273       if ( !res.empty() || exact ){
274 	results.push_back( res );
275       }
276     }
277     return results;
278   }
279 
local_split_at_first_of(const string & src,const string & seps,size_t max)280   static vector<string> local_split_at_first_of( const string& src,
281 						 const string& seps,
282 						 size_t max ){
283     /// split a string into substrings.
284     /*!
285       \param src the string to split
286       \param seps a string with separator characters. one of those should match
287       for a split to happen.
288       \param max if max > 0, limit the size of the result to \e max strings,
289       leaving the remainder in the last part of the result
290       \return a vector of split parts
291     */
292     if ( seps.empty() ){
293       throw runtime_error( "TiCC::split_at_first_of(): separators are empty!" );
294     }
295     vector<string> results;
296     size_t cnt = 0;
297     string::size_type pos = 0;
298     while ( pos != string::npos ){
299       string res;
300       string::size_type e = src.find_first_of( seps, pos );
301       if ( e == string::npos ){
302 	res = src.substr( pos );
303 	pos = e;
304       }
305       else {
306 	res = src.substr( pos, e - pos );
307 	pos = e+1;
308       }
309       if ( !res.empty() ){
310 	results.push_back( res );
311 	++cnt;
312       }
313       if ( max != 0 && cnt >= max-1 ){
314 	if ( pos != string::npos ){
315 	  results.push_back( src.substr( pos ) );
316 	}
317 	break;
318       }
319     }
320     return results;
321   }
322 
split_at(const string & src,const string & sep,size_t max)323   vector<string> split_at( const string& src,
324 			   const string& sep,
325 			   size_t max ){
326     /// split a string into substrings.
327     /*!
328       \param src the string to split
329       \param sep a separator string. May be a multi-character string.
330       \param max if max > 0, limit the size of the result to \e max strings,
331       leaving the remainder in the last part of the result
332       \return a vector of split parts
333     */
334     return local_split_at( src, sep, max );
335   }
336 
split_at(const std::string & s,std::vector<std::string> & v,const std::string & seps)337   size_t split_at( const std::string& s,
338 		   std::vector<std::string>& v,
339 		   const std::string& seps ){
340     v = local_split_at( s, seps, false );
341     return v.size();
342   }
343 
split(const std::string & s,size_t num)344   std::vector<std::string> split( const std::string& s,
345 				  size_t num ){
346     return local_split_at_first_of( s, " \r\t\n", num );
347   }
348 
split_at_first_of(const string & s,const string & seps,size_t num)349   vector<string> split_at_first_of( const string& s,
350 				    const string& seps,
351 				    size_t num ){
352     return local_split_at_first_of( s, seps, num );
353   }
354 
split_at_first_of(const std::string & s,std::vector<std::string> & v,const std::string & seps)355   size_t split_at_first_of( const std::string& s,
356 			    std::vector<std::string>& v,
357 			    const std::string& seps ){
358     v = local_split_at_first_of( s, seps, false );
359     return v.size();
360   }
361 
split(const std::string & s,std::vector<std::string> & v)362   size_t split( const std::string& s,
363 		std::vector<std::string>& v ){
364     v = local_split_at_first_of( s, " \r\t\n", false );
365     return v.size();
366   }
367 
split_exact(const std::string & s,std::vector<std::string> & v)368   size_t split_exact( const std::string& s,
369 		      std::vector<std::string>& v ){
370     v = local_split_at_first_of( s, " \r\t\n", true );
371     return v.size();
372   }
373 
split_exact_at(const std::string & s,std::vector<std::string> & v,const std::string & m)374   size_t split_exact_at( const std::string& s,
375 			 std::vector<std::string>& v,
376 			 const std::string& m ){
377     v = local_split_at( s, m, true );
378     return v.size();
379   }
380 
split_exact_at_first_of(const std::string & s,std::vector<std::string> & v,const std::string & m)381   size_t split_exact_at_first_of( const std::string& s,
382 				  std::vector<std::string>& v,
383 				  const std::string& m ){
384     v = local_split_at_first_of( s, m, true );
385     return v.size();
386   }
387 
join(const vector<string> & vec,const string & sep)388   string join( const vector<string>& vec, const string& sep ){
389     string result;
390     for ( const auto& s : vec ){
391       result += s;
392       if ( &s != &vec.back() ){
393 	result += sep;
394       }
395     }
396     return result;
397   }
398 
format_nonascii(const string & s)399   string format_nonascii( const string& s ){
400     /// format weird strings (like UTF8, LATIN1) printable
401     // useful for debugging
402     stringstream os;
403     os << showbase << hex;
404     for ( const auto& c : s ){
405       if ( isprint(c) && (int)c > 31 ){
406 	os << c;
407       }
408       else {
409 	os << "-" << (short int)c << "-";
410       }
411     }
412     os << noshowbase << dec;
413     return os.str();
414   }
415 
basename(const string & path)416   string basename( const string& path ){
417     /// extract the basename of a path/filename
418     string::size_type pos = path.rfind( "/" );
419     if ( pos != string::npos ){
420       return path.substr(pos+1);
421     }
422     else {
423       return path;
424     }
425   }
426 
dirname(const string & path)427   string dirname( const string& path ){
428     /// extract the dirname of a path/filename
429     string::size_type pos = path.rfind( "/" );
430     if ( pos != string::npos ){
431       return path.substr(0,pos);
432     }
433     else {
434       return ".";
435     }
436   }
437 
realpath(const string & path)438   string realpath( const string& path ){
439     /// give the 'real' pathname for a relative path/filename
440     string result;
441     if ( path.empty() ){
442       return result;
443     }
444     const char *in = path.c_str();
445     char *out = 0;
446     char *res = ::realpath( in, out );
447     if ( res ){
448       result = string(res);
449       free( res);
450     }
451     return result;
452   }
453 
454 } // namespace TiCC
455