1 /* Copyright (C) 2012, 2014, 2018  Olga Yakovleva <yakovleva.o.v@gmail.com> */
2 
3 /* This program is free software: you can redistribute it and/or modify */
4 /* it under the terms of the GNU Lesser General Public License as published by */
5 /* the Free Software Foundation, either version 2.1 of the License, or */
6 /* (at your option) any later version. */
7 
8 /* This program is distributed in the hope that it will be useful, */
9 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
10 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the */
11 /* GNU Lesser General Public License for more details. */
12 
13 /* You should have received a copy of the GNU Lesser General Public License */
14 /* along with this program.  If not, see <http://www.gnu.org/licenses/>. */
15 
16 #ifndef RHVOICE_STR_HPP
17 #define RHVOICE_STR_HPP
18 
19 #include <algorithm>
20 #include <functional>
21 #include <iterator>
22 #include <utility>
23 #include <string>
24 #include <sstream>
25 #include <locale>
26 #include "stdexcept"
27 #include "unicode.hpp"
28 #include "utf8.h"
29 #include "utf.hpp"
30 
31 namespace RHVoice
32 {
33   namespace str
34   {
35     using unicode::tolower;
36     using unicode::toupper;
37 
38     typedef utf::text_iterator<std::string::const_iterator> utf8_string_iterator;
39 
utf8_string_begin(const std::string & s)40     inline utf8_string_iterator utf8_string_begin(const std::string& s)
41     {
42       return utf8_string_iterator(s.begin(),s.begin(),s.end());
43     }
44 
utf8_string_end(const std::string & s)45     inline utf8_string_iterator utf8_string_end(const std::string& s)
46     {
47       return utf8_string_iterator(s.end(),s.begin(),s.end());
48     }
49 
50     template<typename output_iterator>
51     class utf8_insert_iterator: public std::iterator<std::output_iterator_tag,void,void,void,void>
52     {
53     public:
utf8_insert_iterator()54       utf8_insert_iterator()
55       {
56       }
57 
utf8_insert_iterator(output_iterator pos_)58       explicit utf8_insert_iterator(output_iterator pos_):
59       pos(pos_)
60       {
61       }
62 
utf8_insert_iterator(const utf8_insert_iterator & other)63       utf8_insert_iterator(const utf8_insert_iterator& other):
64         pos(other.pos)
65       {
66       }
67 
operator =(const utf8_insert_iterator & other)68       utf8_insert_iterator& operator=(const utf8_insert_iterator& other)
69       {
70         pos=other.pos;
71         return *this;
72       }
73 
operator =(utf8::uint32_t c)74       utf8_insert_iterator& operator=(utf8::uint32_t c)
75       {
76         utf8::append(c,pos);
77         return *this;
78       }
79 
operator *()80       utf8_insert_iterator& operator*()
81       {
82         return *this;
83       }
84 
operator ++()85       utf8_insert_iterator& operator++()
86       {
87         return *this;
88       }
89 
operator ++(int)90       utf8_insert_iterator& operator++(int)
91       {
92         return *this;
93       }
94 
95     private:
96       output_iterator pos;
97     };
98 
99     template<typename output_iterator>
utf8_inserter(output_iterator pos)100     inline utf8_insert_iterator<output_iterator> utf8_inserter(output_iterator pos)
101     {
102       return utf8_insert_iterator<output_iterator>(pos);
103     }
104 
// Output iterator that appends every std::string assigned through it to a
// destination string.
//
// Dereference and both increment operators are no-ops that return *this;
// only assignment of a string has an effect. A default-constructed
// iterator has no destination (null pointer) and must not be assigned a
// string before being re-targeted by copy assignment.
class append_string_iterator: public std::iterator<std::output_iterator_tag,void,void,void,void>
{
public:
  // No destination yet.
  append_string_iterator():
    destination_string(0)
  {
  }

  // Appends to dest_str, which must outlive this iterator.
  explicit append_string_iterator(std::string& dest_str):
    destination_string(&dest_str)
  {
  }

  // Copies share the same destination.
  append_string_iterator(const append_string_iterator& other):
    destination_string(other.destination_string)
  {
  }

  append_string_iterator& operator=(const append_string_iterator& other)
  {
    destination_string=other.destination_string;
    return *this;
  }

  // Appends s to the end of the destination string.
  append_string_iterator& operator=(const std::string& s)
  {
    std::string& target=*destination_string;
    target.append(s);
    return *this;
  }

  append_string_iterator& operator*()
  {
    return *this;
  }

  append_string_iterator& operator++()
  {
    return *this;
  }

  append_string_iterator& operator++(int)
  {
    return *this;
  }

private:
  std::string* destination_string;
};
153 
// Returns true when s1 begins with s2 (byte-wise comparison).
// An empty s2 is a prefix of every string.
inline bool startswith(const std::string& s1,const std::string& s2)
{
  if(s1.size()<s2.size())
    return false;
  // compare() checks the prefix in place; no temporary substring is built.
  return (s1.compare(0,s2.size(),s2)==0);
}
158 
// Returns true when s1 ends with s2 (byte-wise comparison).
// An empty s2 is a suffix of every string.
inline bool endswith(const std::string& s1,const std::string& s2)
{
  if(s1.size()<s2.size())
    return false;
  // compare() checks the tail in place; no temporary substring is built.
  return (s1.compare(s1.size()-s2.size(),s2.size(),s2)==0);
}
163 
isspace(utf8::uint32_t c)164     inline bool isspace(utf8::uint32_t c)
165     {
166       return (unicode::properties(c)&unicode::property_white_space);
167     }
168 
isupper(utf8::uint32_t c)169     inline bool isupper(utf8::uint32_t c)
170     {
171       return ((unicode::properties(c)&unicode::property_uppercase)||(unicode::category(c)==unicode::category_Lt));
172     }
173 
islower(utf8::uint32_t c)174     inline bool islower(utf8::uint32_t c)
175     {
176       return (unicode::properties(c)&unicode::property_lowercase);
177     }
178 
isalpha(utf8::uint32_t c)179     inline bool isalpha(utf8::uint32_t c)
180     {
181       return (unicode::properties(c)&unicode::property_alphabetic);
182     }
183 
isquote(utf8::uint32_t c)184     inline bool isquote(utf8::uint32_t c)
185     {
186       return (unicode::properties(c)&unicode::property_quotation_mark);
187     }
188 
isdash(utf8::uint32_t c)189     inline bool isdash(utf8::uint32_t c)
190     {
191       return (unicode::properties(c)&unicode::property_dash);
192     }
193 
ispunct(utf8::uint32_t c)194     inline bool ispunct(utf8::uint32_t c)
195     {
196       return (unicode::category(c).major_class=='P');
197     }
198 
istermpunct(utf8::uint32_t c)199     inline bool istermpunct(utf8::uint32_t c)
200     {
201       return (unicode::properties(c)&unicode::property_terminal_punctuation);
202     }
203 
issterm(utf8::uint32_t c)204     inline bool issterm(utf8::uint32_t c)
205     {
206       return (unicode::properties(c)&unicode::property_sterm);
207     }
208 
isdigit(utf8::uint32_t c)209     inline bool isdigit(utf8::uint32_t c)
210     {
211       return (unicode::category(c)==unicode::category_Nd);
212     }
213 
isadigit(utf8::uint32_t c)214     inline bool isadigit(utf8::uint32_t c)
215     {
216       return ((c>='0')&&(c<='9'));
217     }
218 
issym(utf8::uint32_t c)219     inline bool issym(utf8::uint32_t c)
220     {
221       return (unicode::category(c).major_class=='S');
222     }
223 
224     struct is_space: public std::unary_function<utf8::uint32_t,bool>
225     {
operator ()RHVoice::str::is_space226       bool operator()(utf8::uint32_t c) const
227       {
228         return isspace(c);
229       }
230     };
231 
232     struct is_upper: public std::unary_function<utf8::uint32_t,bool>
233     {
operator ()RHVoice::str::is_upper234       bool operator()(utf8::uint32_t c) const
235       {
236         return isupper(c);
237       }
238     };
239 
240     struct is_lower: public std::unary_function<utf8::uint32_t,bool>
241     {
operator ()RHVoice::str::is_lower242       bool operator()(utf8::uint32_t c) const
243       {
244         return islower(c);
245       }
246     };
247 
248     struct is_alpha: public std::unary_function<utf8::uint32_t,bool>
249     {
operator ()RHVoice::str::is_alpha250       bool operator()(utf8::uint32_t c) const
251       {
252         return isalpha(c);
253       }
254     };
255 
256     struct is_quote: public std::unary_function<utf8::uint32_t,bool>
257     {
operator ()RHVoice::str::is_quote258       bool operator()(utf8::uint32_t c) const
259       {
260         return isquote(c);
261       }
262     };
263 
264     struct is_dash: public std::unary_function<utf8::uint32_t,bool>
265     {
operator ()RHVoice::str::is_dash266       bool operator()(utf8::uint32_t c) const
267       {
268         return isdash(c);
269       }
270     };
271 
272     struct is_punct: public std::unary_function<utf8::uint32_t,bool>
273     {
operator ()RHVoice::str::is_punct274       bool operator()(utf8::uint32_t c) const
275       {
276         return ispunct(c);
277       }
278     };
279 
280     struct is_termpunct: public std::unary_function<utf8::uint32_t,bool>
281     {
operator ()RHVoice::str::is_termpunct282       bool operator()(utf8::uint32_t c) const
283       {
284         return istermpunct(c);
285       }
286     };
287 
288     struct is_sterm: public std::unary_function<utf8::uint32_t,bool>
289     {
operator ()RHVoice::str::is_sterm290       bool operator()(utf8::uint32_t c) const
291       {
292         return issterm(c);
293       }
294     };
295 
296     struct is_digit: public std::unary_function<utf8::uint32_t,bool>
297     {
operator ()RHVoice::str::is_digit298       bool operator()(utf8::uint32_t c) const
299       {
300         return isdigit(c);
301       }
302     };
303 
304     struct is_adigit: public std::unary_function<utf8::uint32_t,bool>
305     {
operator ()RHVoice::str::is_adigit306       bool operator()(utf8::uint32_t c) const
307       {
308         return isadigit(c);
309       }
310     };
311 
312     struct is_equal_to: public std::unary_function<utf8::uint32_t,bool>
313     {
is_equal_toRHVoice::str::is_equal_to314       explicit is_equal_to(utf8::uint32_t c):
315       chr(c)
316       {
317       }
318 
operator ()RHVoice::str::is_equal_to319       bool operator()(utf8::uint32_t c) const
320       {
321         return (c==chr);
322       }
323 
324     private:
325       utf8::uint32_t chr;
326     };
327 
trim(const std::string & s)328     inline std::string trim(const std::string& s)
329     {
330       std::string::const_iterator last=s.end();
331       std::string::const_iterator tmp=last;
332       while(last!=s.begin())
333         {
334           if(isspace(utf8::prior(tmp,s.begin())))
335             last=tmp;
336           else
337             break;
338         }
339       std::string::const_iterator first=s.begin();
340       tmp=first;
341       while(first!=last)
342         {
343           if(isspace(utf8::next(tmp,last)))
344             first=tmp;
345           else
346             break;
347         }
348       std::string result(first,last);
349       return result;
350     }
351 
equal(const std::string & s1,const std::string & s2)352     inline bool equal(const std::string& s1,const std::string& s2)
353     {
354       std::string::const_iterator pos1=s1.begin();
355       std::string::const_iterator pos2=s2.begin();
356       while((pos1!=s1.end())&&(pos2!=s2.end()))
357         {
358           if(tolower(utf8::next(pos1,s1.end()))!=tolower(utf8::next(pos2,s2.end())))
359             return false;
360         }
361       return ((pos1==s1.end())&&(pos2==s2.end()));
362     }
363 
364     struct less: public std::binary_function<const std::string&,const std::string&,bool>
365     {
operator ()RHVoice::str::less366       bool operator()(const std::string& s1,const std::string& s2) const
367       {
368         std::string::const_iterator pos1=s1.begin();
369         std::string::const_iterator pos2=s2.begin();
370         utf8::uint32_t cp1,cp2;
371         while((pos1!=s1.end())&&(pos2!=s2.end()))
372           {
373             cp1=tolower(utf8::next(pos1,s1.end()));
374             cp2=tolower(utf8::next(pos2,s2.end()));
375             if(cp1!=cp2)
376               return (cp1<cp2);
377           }
378         return ((pos1==s1.end())&&(pos2!=s2.end()));
379       }
380     };
381 
382     template<typename delimiter_predicate>
383     class tokenizer
384     {
385     public:
386       class iterator: public std::iterator<std::input_iterator_tag,const std::string>
387       {
388       public:
iterator(const utf8_string_iterator & first_,const utf8_string_iterator & last_,delimiter_predicate pred)389         iterator(const utf8_string_iterator& first_,const utf8_string_iterator& last_,delimiter_predicate pred):
390           first(first_),
391           last(first_),
392           end(last_),
393           is_delimiter(pred)
394         {
395           ++(*this);
396         }
397 
operator *() const398         const std::string& operator*() const
399         {
400           return value;
401         }
402 
operator ->() const403         const std::string* operator->() const
404         {
405           return &value;
406         }
407 
operator ==(const iterator & other) const408         bool operator==(const iterator& other) const
409         {
410           return ((first==other.first)&&(last==other.last)&&(end==other.end));
411         }
412 
operator !=(const iterator & other) const413         bool operator!=(const iterator& other) const
414         {
415           return !(*this==other);
416         }
417 
operator ++()418         iterator& operator++()
419         {
420           first=std::find_if(last,end,std::not1(is_delimiter));
421           last=std::find_if(first,end,is_delimiter);
422           value.assign(first.base(),last.base());
423           return *this;
424         }
425 
operator ++(int)426         iterator operator++(int)
427         {
428           iterator tmp=*this;
429           ++(*this);
430           return tmp;
431         }
432 
433       private:
434         utf8_string_iterator first,last,end;
435         std::string value;
436         delimiter_predicate is_delimiter;
437       };
438 
tokenizer(const std::string & s,delimiter_predicate pred=delimiter_predicate ())439       tokenizer(const std::string& s,delimiter_predicate pred=delimiter_predicate()):
440         text(s),
441         is_delimiter(pred)
442       {
443       }
444 
assign(const std::string & s)445       void assign(const std::string& s)
446       {
447         text=s;
448       }
449 
begin() const450       iterator begin() const
451       {
452         return iterator(utf8_string_begin(text),utf8_string_end(text),is_delimiter);
453       }
454 
end() const455       iterator end() const
456       {
457         return iterator(utf8_string_end(text),utf8_string_end(text),is_delimiter);
458       }
459 
460     private:
461       std::string text;
462       delimiter_predicate is_delimiter;
463     };
464 
count_newlines(input_iterator start,input_iterator end)465     template<typename input_iterator> std::size_t count_newlines(input_iterator start,input_iterator end)
466     {
467       utf8::uint32_t chr;
468       utf8::uint32_t prev_chr=0;
469       std::size_t n=0;
470       for(input_iterator it=start;it!=end;++it)
471         {
472           chr=*it;
473           if(((chr=='\n')&&(prev_chr!='\r'))||(chr=='\r')||(chr==0x85)||(chr==0x2028)||(chr==0x2029))
474             ++n;
475           prev_chr=chr;
476         }
477       return n;
478     }
479 
480     struct to_lower
481     {
operator ()RHVoice::str::to_lower482       utf8::uint32_t operator()(utf8::uint32_t c) const
483       {
484         return tolower(c);
485       }
486 
operator ()RHVoice::str::to_lower487       std::string operator()(const std::string& s) const
488       {
489         std::string result;
490         std::transform(utf8_string_begin(s),utf8_string_end(s),utf8_inserter(std::back_inserter(result)),tolower);
491         return result;
492       }
493     };
494 
// Formats v with operator<< into a string.
// The stream is imbued with the classic "C" locale, so the result does not
// depend on the global locale.
template<typename T>
std::string to_string(const T& v)
{
  std::ostringstream formatter;
  formatter.imbue(std::locale::classic());
  formatter << v;
  return formatter.str();
}
503 
// Parses a value of type T from s with operator>>.
// The stream is imbued with the classic "C" locale. Only the extraction
// itself is checked: any text left after a successfully extracted value is
// ignored. Throws std::invalid_argument when the extraction fails.
template<typename T>
T from_string(const std::string& s)
{
  std::istringstream parser(s);
  parser.imbue(std::locale::classic());
  T parsed;
  parser >> parsed;
  if(parser.fail())
    throw std::invalid_argument("Invalid type representation as a string");
  return parsed;
}
515 
is_single_char(const std::string & s)516     inline bool is_single_char(const std::string& s)
517     {
518       if(s.empty())
519         return false;
520       utf8_string_iterator it=utf8_string_begin(s);
521       ++it;
522       return (it==utf8_string_end(s));
523     }
524   }
525 }
526 #endif
527