1 /* Copyright (C) 2012, 2014, 2018 Olga Yakovleva <yakovleva.o.v@gmail.com> */ 2 3 /* This program is free software: you can redistribute it and/or modify */ 4 /* it under the terms of the GNU Lesser General Public License as published by */ 5 /* the Free Software Foundation, either version 2.1 of the License, or */ 6 /* (at your option) any later version. */ 7 8 /* This program is distributed in the hope that it will be useful, */ 9 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ 10 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ 11 /* GNU Lesser General Public License for more details. */ 12 13 /* You should have received a copy of the GNU Lesser General Public License */ 14 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */ 15 16 #ifndef RHVOICE_STR_HPP 17 #define RHVOICE_STR_HPP 18 19 #include <algorithm> 20 #include <functional> 21 #include <iterator> 22 #include <utility> 23 #include <string> 24 #include <sstream> 25 #include <locale> 26 #include "stdexcept" 27 #include "unicode.hpp" 28 #include "utf8.h" 29 #include "utf.hpp" 30 31 namespace RHVoice 32 { 33 namespace str 34 { 35 using unicode::tolower; 36 using unicode::toupper; 37 38 typedef utf::text_iterator<std::string::const_iterator> utf8_string_iterator; 39 utf8_string_begin(const std::string & s)40 inline utf8_string_iterator utf8_string_begin(const std::string& s) 41 { 42 return utf8_string_iterator(s.begin(),s.begin(),s.end()); 43 } 44 utf8_string_end(const std::string & s)45 inline utf8_string_iterator utf8_string_end(const std::string& s) 46 { 47 return utf8_string_iterator(s.end(),s.begin(),s.end()); 48 } 49 50 template<typename output_iterator> 51 class utf8_insert_iterator: public std::iterator<std::output_iterator_tag,void,void,void,void> 52 { 53 public: utf8_insert_iterator()54 utf8_insert_iterator() 55 { 56 } 57 utf8_insert_iterator(output_iterator pos_)58 explicit utf8_insert_iterator(output_iterator pos_): 59 pos(pos_) 60 { 61 } 62 utf8_insert_iterator(const utf8_insert_iterator & other)63 utf8_insert_iterator(const utf8_insert_iterator& other): 64 pos(other.pos) 65 { 66 } 67 operator =(const utf8_insert_iterator & other)68 utf8_insert_iterator& operator=(const utf8_insert_iterator& other) 69 { 70 pos=other.pos; 71 return *this; 72 } 73 operator =(utf8::uint32_t c)74 utf8_insert_iterator& operator=(utf8::uint32_t c) 75 { 76 utf8::append(c,pos); 77 return *this; 78 } 79 operator *()80 utf8_insert_iterator& operator*() 81 { 82 return *this; 83 } 84 operator ++()85 utf8_insert_iterator& operator++() 86 { 87 return *this; 88 } 89 operator ++(int)90 utf8_insert_iterator& operator++(int) 91 { 92 return *this; 93 } 94 95 private: 96 output_iterator pos; 97 }; 98 99 template<typename output_iterator> utf8_inserter(output_iterator pos)100 inline utf8_insert_iterator<output_iterator> utf8_inserter(output_iterator pos) 101 { 102 return utf8_insert_iterator<output_iterator>(pos); 103 } 104 105 class append_string_iterator: public std::iterator<std::output_iterator_tag,void,void,void,void> 106 { 107 public: append_string_iterator()108 append_string_iterator(): 109 destination_string(0) 110 { 111 } 112 append_string_iterator(const append_string_iterator & other)113 append_string_iterator(const append_string_iterator& other): 114 destination_string(other.destination_string) 115 { 116 } 117 append_string_iterator(std::string & dest_str)118 explicit append_string_iterator(std::string& dest_str): 119 destination_string(&dest_str) 120 { 121 } 122 operator =(const append_string_iterator & other)123 append_string_iterator& operator=(const append_string_iterator& other) 124 { 125 destination_string=other.destination_string; 126 return *this; 127 } 128 operator =(const std::string & s)129 append_string_iterator& operator=(const std::string& s) 130 { 131 destination_string->append(s); 132 return *this; 133 } 134 operator *()135 append_string_iterator& operator*() 136 { 137 return *this; 138 } 139 operator ++()140 append_string_iterator& operator++() 141 { 142 return *this; 143 } 144 operator ++(int)145 append_string_iterator& operator++(int) 146 { 147 return *this; 148 } 149 150 private: 151 std::string* destination_string; 152 }; 153 startswith(const std::string & s1,const std::string & s2)154 inline bool startswith(const std::string& s1,const std::string& s2) 155 { 156 return (s1.size()<s2.size())?false:(s1.substr(0,s2.size())==s2); 157 } 158 endswith(const std::string & s1,const std::string & s2)159 inline bool endswith(const std::string& s1,const std::string& s2) 160 { 161 return (s1.size()<s2.size())?false:(s1.substr(s1.size()-s2.size(),s2.size())==s2); 162 } 163 isspace(utf8::uint32_t c)164 inline bool isspace(utf8::uint32_t c) 165 { 166 return (unicode::properties(c)&unicode::property_white_space); 167 } 168 isupper(utf8::uint32_t c)169 inline bool isupper(utf8::uint32_t c) 170 { 171 return ((unicode::properties(c)&unicode::property_uppercase)||(unicode::category(c)==unicode::category_Lt)); 172 } 173 islower(utf8::uint32_t c)174 inline bool islower(utf8::uint32_t c) 175 { 176 return (unicode::properties(c)&unicode::property_lowercase); 177 } 178 isalpha(utf8::uint32_t c)179 inline bool isalpha(utf8::uint32_t c) 180 { 181 return (unicode::properties(c)&unicode::property_alphabetic); 182 } 183 isquote(utf8::uint32_t c)184 inline bool isquote(utf8::uint32_t c) 185 { 186 return (unicode::properties(c)&unicode::property_quotation_mark); 187 } 188 isdash(utf8::uint32_t c)189 inline bool isdash(utf8::uint32_t c) 190 { 191 return (unicode::properties(c)&unicode::property_dash); 192 } 193 ispunct(utf8::uint32_t c)194 inline bool ispunct(utf8::uint32_t c) 195 { 196 return (unicode::category(c).major_class=='P'); 197 } 198 istermpunct(utf8::uint32_t c)199 inline bool istermpunct(utf8::uint32_t c) 200 { 201 return (unicode::properties(c)&unicode::property_terminal_punctuation); 202 } 203 issterm(utf8::uint32_t c)204 inline bool issterm(utf8::uint32_t c) 205 { 206 return (unicode::properties(c)&unicode::property_sterm); 207 } 208 isdigit(utf8::uint32_t c)209 inline bool isdigit(utf8::uint32_t c) 210 { 211 return (unicode::category(c)==unicode::category_Nd); 212 } 213 isadigit(utf8::uint32_t c)214 inline bool isadigit(utf8::uint32_t c) 215 { 216 return ((c>='0')&&(c<='9')); 217 } 218 issym(utf8::uint32_t c)219 inline bool issym(utf8::uint32_t c) 220 { 221 return (unicode::category(c).major_class=='S'); 222 } 223 224 struct is_space: public std::unary_function<utf8::uint32_t,bool> 225 { operator ()RHVoice::str::is_space226 bool operator()(utf8::uint32_t c) const 227 { 228 return isspace(c); 229 } 230 }; 231 232 struct is_upper: public std::unary_function<utf8::uint32_t,bool> 233 { operator ()RHVoice::str::is_upper234 bool operator()(utf8::uint32_t c) const 235 { 236 return isupper(c); 237 } 238 }; 239 240 struct is_lower: public std::unary_function<utf8::uint32_t,bool> 241 { operator ()RHVoice::str::is_lower242 bool operator()(utf8::uint32_t c) const 243 { 244 return islower(c); 245 } 246 }; 247 248 struct is_alpha: public std::unary_function<utf8::uint32_t,bool> 249 { operator ()RHVoice::str::is_alpha250 bool operator()(utf8::uint32_t c) const 251 { 252 return isalpha(c); 253 } 254 }; 255 256 struct is_quote: public std::unary_function<utf8::uint32_t,bool> 257 { operator ()RHVoice::str::is_quote258 bool operator()(utf8::uint32_t c) const 259 { 260 return isquote(c); 261 } 262 }; 263 264 struct is_dash: public std::unary_function<utf8::uint32_t,bool> 265 { operator ()RHVoice::str::is_dash266 bool operator()(utf8::uint32_t c) const 267 { 268 return isdash(c); 269 } 270 }; 271 272 struct is_punct: public std::unary_function<utf8::uint32_t,bool> 273 { operator ()RHVoice::str::is_punct274 bool operator()(utf8::uint32_t c) const 275 { 276 return ispunct(c); 277 } 278 }; 279 280 struct is_termpunct: public std::unary_function<utf8::uint32_t,bool> 281 { operator ()RHVoice::str::is_termpunct282 bool operator()(utf8::uint32_t c) const 283 { 284 return istermpunct(c); 285 } 286 }; 287 288 struct is_sterm: public std::unary_function<utf8::uint32_t,bool> 289 { operator ()RHVoice::str::is_sterm290 bool operator()(utf8::uint32_t c) const 291 { 292 return issterm(c); 293 } 294 }; 295 296 struct is_digit: public std::unary_function<utf8::uint32_t,bool> 297 { operator ()RHVoice::str::is_digit298 bool operator()(utf8::uint32_t c) const 299 { 300 return isdigit(c); 301 } 302 }; 303 304 struct is_adigit: public std::unary_function<utf8::uint32_t,bool> 305 { operator ()RHVoice::str::is_adigit306 bool operator()(utf8::uint32_t c) const 307 { 308 return isadigit(c); 309 } 310 }; 311 312 struct is_equal_to: public std::unary_function<utf8::uint32_t,bool> 313 { is_equal_toRHVoice::str::is_equal_to314 explicit is_equal_to(utf8::uint32_t c): 315 chr(c) 316 { 317 } 318 operator ()RHVoice::str::is_equal_to319 bool operator()(utf8::uint32_t c) const 320 { 321 return (c==chr); 322 } 323 324 private: 325 utf8::uint32_t chr; 326 }; 327 trim(const std::string & s)328 inline std::string trim(const std::string& s) 329 { 330 std::string::const_iterator last=s.end(); 331 std::string::const_iterator tmp=last; 332 while(last!=s.begin()) 333 { 334 if(isspace(utf8::prior(tmp,s.begin()))) 335 last=tmp; 336 else 337 break; 338 } 339 std::string::const_iterator first=s.begin(); 340 tmp=first; 341 while(first!=last) 342 { 343 if(isspace(utf8::next(tmp,last))) 344 first=tmp; 345 else 346 break; 347 } 348 std::string result(first,last); 349 return result; 350 } 351 equal(const std::string & s1,const std::string & s2)352 inline bool equal(const std::string& s1,const std::string& s2) 353 { 354 std::string::const_iterator pos1=s1.begin(); 355 std::string::const_iterator pos2=s2.begin(); 356 while((pos1!=s1.end())&&(pos2!=s2.end())) 357 { 358 if(tolower(utf8::next(pos1,s1.end()))!=tolower(utf8::next(pos2,s2.end()))) 359 return false; 360 } 361 return ((pos1==s1.end())&&(pos2==s2.end())); 362 } 363 364 struct less: public std::binary_function<const std::string&,const std::string&,bool> 365 { operator ()RHVoice::str::less366 bool operator()(const std::string& s1,const std::string& s2) const 367 { 368 std::string::const_iterator pos1=s1.begin(); 369 std::string::const_iterator pos2=s2.begin(); 370 utf8::uint32_t cp1,cp2; 371 while((pos1!=s1.end())&&(pos2!=s2.end())) 372 { 373 cp1=tolower(utf8::next(pos1,s1.end())); 374 cp2=tolower(utf8::next(pos2,s2.end())); 375 if(cp1!=cp2) 376 return (cp1<cp2); 377 } 378 return ((pos1==s1.end())&&(pos2!=s2.end())); 379 } 380 }; 381 382 template<typename delimiter_predicate> 383 class tokenizer 384 { 385 public: 386 class iterator: public std::iterator<std::input_iterator_tag,const std::string> 387 { 388 public: iterator(const utf8_string_iterator & first_,const utf8_string_iterator & last_,delimiter_predicate pred)389 iterator(const utf8_string_iterator& first_,const utf8_string_iterator& last_,delimiter_predicate pred): 390 first(first_), 391 last(first_), 392 end(last_), 393 is_delimiter(pred) 394 { 395 ++(*this); 396 } 397 operator *() const398 const std::string& operator*() const 399 { 400 return value; 401 } 402 operator ->() const403 const std::string* operator->() const 404 { 405 return &value; 406 } 407 operator ==(const iterator & other) const408 bool operator==(const iterator& other) const 409 { 410 return ((first==other.first)&&(last==other.last)&&(end==other.end)); 411 } 412 operator !=(const iterator & other) const413 bool operator!=(const iterator& other) const 414 { 415 return !(*this==other); 416 } 417 operator ++()418 iterator& operator++() 419 { 420 first=std::find_if(last,end,std::not1(is_delimiter)); 421 last=std::find_if(first,end,is_delimiter); 422 value.assign(first.base(),last.base()); 423 return *this; 424 } 425 operator ++(int)426 iterator operator++(int) 427 { 428 iterator tmp=*this; 429 ++(*this); 430 return tmp; 431 } 432 433 private: 434 utf8_string_iterator first,last,end; 435 std::string value; 436 delimiter_predicate is_delimiter; 437 }; 438 tokenizer(const std::string & s,delimiter_predicate pred=delimiter_predicate ())439 tokenizer(const std::string& s,delimiter_predicate pred=delimiter_predicate()): 440 text(s), 441 is_delimiter(pred) 442 { 443 } 444 assign(const std::string & s)445 void assign(const std::string& s) 446 { 447 text=s; 448 } 449 begin() const450 iterator begin() const 451 { 452 return iterator(utf8_string_begin(text),utf8_string_end(text),is_delimiter); 453 } 454 end() const455 iterator end() const 456 { 457 return iterator(utf8_string_end(text),utf8_string_end(text),is_delimiter); 458 } 459 460 private: 461 std::string text; 462 delimiter_predicate is_delimiter; 463 }; 464 count_newlines(input_iterator start,input_iterator end)465 template<typename input_iterator> std::size_t count_newlines(input_iterator start,input_iterator end) 466 { 467 utf8::uint32_t chr; 468 utf8::uint32_t prev_chr=0; 469 std::size_t n=0; 470 for(input_iterator it=start;it!=end;++it) 471 { 472 chr=*it; 473 if(((chr=='\n')&&(prev_chr!='\r'))||(chr=='\r')||(chr==0x85)||(chr==0x2028)||(chr==0x2029)) 474 ++n; 475 prev_chr=chr; 476 } 477 return n; 478 } 479 480 struct to_lower 481 { operator ()RHVoice::str::to_lower482 utf8::uint32_t operator()(utf8::uint32_t c) const 483 { 484 return tolower(c); 485 } 486 operator ()RHVoice::str::to_lower487 std::string operator()(const std::string& s) const 488 { 489 std::string result; 490 std::transform(utf8_string_begin(s),utf8_string_end(s),utf8_inserter(std::back_inserter(result)),tolower); 491 return result; 492 } 493 }; 494 495 template<typename T> to_string(const T & v)496 std::string to_string(const T& v) 497 { 498 std::ostringstream s; 499 s.imbue(std::locale::classic()); 500 s << v; 501 return s.str(); 502 } 503 504 template<typename T> from_string(const std::string & s)505 T from_string(const std::string& s) 506 { 507 std::istringstream strm(s); 508 strm.imbue(std::locale::classic()); 509 T result; 510 if(strm>>result) 511 return result; 512 else 513 throw std::invalid_argument("Invalid type representation as a string"); 514 } 515 is_single_char(const std::string & s)516 inline bool is_single_char(const std::string& s) 517 { 518 if(s.empty()) 519 return false; 520 utf8_string_iterator it=utf8_string_begin(s); 521 ++it; 522 return (it==utf8_string_end(s)); 523 } 524 } 525 } 526 #endif 527