// Boost token_functions.hpp  ------------------------------------------------//

// Copyright John R. Bandela 2001.

// Distributed under the Boost Software License, Version 1.0. (See
// accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)

// See http://www.boost.org/libs/tokenizer/ for documentation.

// Revision History:
// 01 Oct 2004   Joaquin M Lopez Munoz
//      Workaround for a problem with string::assign in msvc-stlport
// 06 Apr 2004   John Bandela
//      Fixed a bug involving using char_delimiter with a true input iterator
// 28 Nov 2003   Robert Zeh and John Bandela
//      Converted into "fast" functions that avoid using += when
//      the supplied iterator isn't an input_iterator; based on
//      some work done at Archelon and a version that was checked into
//      the boost CVS for a short period of time.
// 20 Feb 2002   John Maddock
//      Removed using namespace std declarations and added
//      workaround for BOOST_NO_STDC_NAMESPACE (the library
//      can be safely mixed with regex).
// 06 Feb 2002   Jeremy Siek
//      Added char_separator.
// 02 Feb 2002   Jeremy Siek
//      Removed tabs and a little cleanup.


#ifndef BOOST_TOKEN_FUNCTIONS_JRB120303_HPP_
#define BOOST_TOKEN_FUNCTIONS_JRB120303_HPP_

#include <algorithm> // for find_if
#include <cctype>
#include <iterator>  // for the iterator category tags
#include <stdexcept>
#include <string>
#include <vector>
#include <boost/config.hpp>
#include <boost/assert.hpp>
#include <boost/detail/workaround.hpp>
#include <boost/mpl/if.hpp>
#include <boost/throw_exception.hpp>
#include <boost/type_traits/is_pointer.hpp>
#if !defined(BOOST_NO_CWCTYPE)
#include <cwctype>
#endif

//
// the following must not be macros if we are to prefix them
// with std:: (they shouldn't be macros anyway...)
51 // 52 #ifdef ispunct 53 # undef ispunct 54 #endif 55 #ifdef iswpunct 56 # undef iswpunct 57 #endif 58 #ifdef isspace 59 # undef isspace 60 #endif 61 #ifdef iswspace 62 # undef iswspace 63 #endif 64 // 65 // fix namespace problems: 66 // 67 #ifdef BOOST_NO_STDC_NAMESPACE 68 namespace std{ 69 using ::ispunct; 70 using ::isspace; 71 #if !defined(BOOST_NO_CWCTYPE) 72 using ::iswpunct; 73 using ::iswspace; 74 #endif 75 } 76 #endif 77 78 namespace boost{ 79 //=========================================================================== 80 // The escaped_list_separator class. Which is a model of TokenizerFunction 81 // An escaped list is a super-set of what is commonly known as a comma 82 // separated value (csv) list.It is separated into fields by a comma or 83 // other character. If the delimiting character is inside quotes, then it is 84 // counted as a regular character.To allow for embedded quotes in a field, 85 // there can be escape sequences using the \ much like C. 86 // The role of the comma, the quotation mark, and the escape 87 // character (backslash \), can be assigned to other characters. 88 89 struct escaped_list_error : public std::runtime_error{ escaped_list_errorboost::escaped_list_error90 escaped_list_error(const std::string& what_arg):std::runtime_error(what_arg) { } 91 }; 92 93 94 // The out of the box GCC 2.95 on cygwin does not have a char_traits class. 
95 // MSVC does not like the following typename 96 template <class Char, 97 class Traits = BOOST_DEDUCED_TYPENAME std::basic_string<Char>::traits_type > 98 class escaped_list_separator { 99 100 private: 101 typedef std::basic_string<Char,Traits> string_type; 102 struct char_eq { 103 Char e_; char_eqboost::escaped_list_separator::char_eq104 char_eq(Char e):e_(e) { } operator ()boost::escaped_list_separator::char_eq105 bool operator()(Char c) { 106 return Traits::eq(e_,c); 107 } 108 }; 109 string_type escape_; 110 string_type c_; 111 string_type quote_; 112 bool last_; 113 is_escape(Char e)114 bool is_escape(Char e) { 115 char_eq f(e); 116 return std::find_if(escape_.begin(),escape_.end(),f)!=escape_.end(); 117 } is_c(Char e)118 bool is_c(Char e) { 119 char_eq f(e); 120 return std::find_if(c_.begin(),c_.end(),f)!=c_.end(); 121 } is_quote(Char e)122 bool is_quote(Char e) { 123 char_eq f(e); 124 return std::find_if(quote_.begin(),quote_.end(),f)!=quote_.end(); 125 } 126 template <typename iterator, typename Token> do_escape(iterator & next,iterator end,Token & tok)127 void do_escape(iterator& next,iterator end,Token& tok) { 128 if (++next == end) 129 BOOST_THROW_EXCEPTION(escaped_list_error(std::string("cannot end with escape"))); 130 if (Traits::eq(*next,'n')) { 131 tok+='\n'; 132 return; 133 } 134 else if (is_quote(*next)) { 135 tok+=*next; 136 return; 137 } 138 else if (is_c(*next)) { 139 tok+=*next; 140 return; 141 } 142 else if (is_escape(*next)) { 143 tok+=*next; 144 return; 145 } 146 else 147 BOOST_THROW_EXCEPTION(escaped_list_error(std::string("unknown escape sequence"))); 148 } 149 150 public: 151 escaped_list_separator(Char e='\\\\',Char c=',',Char q='\\"')152 explicit escaped_list_separator(Char e = '\\', 153 Char c = ',',Char q = '\"') 154 : escape_(1,e), c_(1,c), quote_(1,q), last_(false) { } 155 escaped_list_separator(string_type e,string_type c,string_type q)156 escaped_list_separator(string_type e, string_type c, string_type q) 157 : escape_(e), c_(c), 
quote_(q), last_(false) { } 158 reset()159 void reset() {last_=false;} 160 161 template <typename InputIterator, typename Token> operator ()(InputIterator & next,InputIterator end,Token & tok)162 bool operator()(InputIterator& next,InputIterator end,Token& tok) { 163 bool bInQuote = false; 164 tok = Token(); 165 166 if (next == end) { 167 if (last_) { 168 last_ = false; 169 return true; 170 } 171 else 172 return false; 173 } 174 last_ = false; 175 for (;next != end;++next) { 176 if (is_escape(*next)) { 177 do_escape(next,end,tok); 178 } 179 else if (is_c(*next)) { 180 if (!bInQuote) { 181 // If we are not in quote, then we are done 182 ++next; 183 // The last character was a c, that means there is 184 // 1 more blank field 185 last_ = true; 186 return true; 187 } 188 else tok+=*next; 189 } 190 else if (is_quote(*next)) { 191 bInQuote=!bInQuote; 192 } 193 else { 194 tok += *next; 195 } 196 } 197 return true; 198 } 199 }; 200 201 //=========================================================================== 202 // The classes here are used by offset_separator and char_separator to implement 203 // faster assigning of tokens using assign instead of += 204 205 namespace tokenizer_detail { 206 //=========================================================================== 207 // Tokenizer was broken for wide character separators, at least on Windows, since 208 // CRT functions isspace etc only expect values in [0, 0xFF]. Debug build asserts 209 // if higher values are passed in. The traits extension class should take care of this. 210 // Assuming that the conditional will always get optimized out in the function 211 // implementations, argument types are not a problem since both forms of character classifiers 212 // expect an int. 
#if !defined(BOOST_NO_CWCTYPE)
// Classification helpers selected by traits_extension on the size N of the
// character type.  The primary template serves every N other than 1 and
// forwards to the wide-character classifiers.
template<typename traits, int N>
struct traits_extension_details : public traits {
  typedef typename traits::char_type char_type;
  static bool isspace(char_type c)
  {
    return std::iswspace(c) != 0;
  }
  static bool ispunct(char_type c)
  {
    return std::iswpunct(c) != 0;
  }
};

// One-byte character types use the narrow classifiers.  The argument is
// converted to unsigned char first: std::isspace / std::ispunct have
// undefined behaviour for values that are neither EOF nor representable as
// unsigned char, which is what a negative plain char would be.
template<typename traits>
struct traits_extension_details<traits, 1> : public traits {
  typedef typename traits::char_type char_type;
  static bool isspace(char_type c)
  {
    return std::isspace(static_cast<unsigned char>(c)) != 0;
  }
  static bool ispunct(char_type c)
  {
    return std::ispunct(static_cast<unsigned char>(c)) != 0;
  }
};
#endif


// Adds isspace() / ispunct() to a character traits class.
// In case there is no cwctype header, we implement the checks manually.
// We make use of the fact that the tested categories should fit in ASCII.
template<typename traits>
struct traits_extension : public traits {
  typedef typename traits::char_type char_type;

  // True when c is classified as whitespace.
  static bool isspace(char_type c)
  {
#if !defined(BOOST_NO_CWCTYPE)
    return traits_extension_details<traits, sizeof(char_type)>::isspace(c);
#else
    // Values above 255 are rejected before the narrow classifier is called.
    return static_cast< unsigned >(c) <= 255 && std::isspace(c) != 0;
#endif
  }

  // True when c is classified as punctuation.
  static bool ispunct(char_type c)
  {
#if !defined(BOOST_NO_CWCTYPE)
    return traits_extension_details<traits, sizeof(char_type)>::ispunct(c);
#else
    // Values above 255 are rejected before the narrow classifier is called.
    return static_cast< unsigned >(c) <= 255 && std::ispunct(c) != 0;
#endif
  }
};

// The assign_or_plus_equal struct contains functions that implement
// assign, +=, and clearing based on the iterator type.
The 269 // generic case does nothing for plus_equal and clearing, while 270 // passing through the call for assign. 271 // 272 // When an input iterator is being used, the situation is reversed. 273 // The assign method does nothing, plus_equal invokes operator +=, 274 // and the clearing method sets the supplied token to the default 275 // token constructor's result. 276 // 277 278 template<class IteratorTag> 279 struct assign_or_plus_equal { 280 template<class Iterator, class Token> assignboost::tokenizer_detail::assign_or_plus_equal281 static void assign(Iterator b, Iterator e, Token &t) { 282 t.assign(b, e); 283 } 284 285 template<class Token, class Value> plus_equalboost::tokenizer_detail::assign_or_plus_equal286 static void plus_equal(Token &, const Value &) { } 287 288 // If we are doing an assign, there is no need for the 289 // the clear. 290 // 291 template<class Token> clearboost::tokenizer_detail::assign_or_plus_equal292 static void clear(Token &) { } 293 }; 294 295 template <> 296 struct assign_or_plus_equal<std::input_iterator_tag> { 297 template<class Iterator, class Token> assignboost::tokenizer_detail::assign_or_plus_equal298 static void assign(Iterator , Iterator , Token &) { } 299 template<class Token, class Value> plus_equalboost::tokenizer_detail::assign_or_plus_equal300 static void plus_equal(Token &t, const Value &v) { 301 t += v; 302 } 303 template<class Token> clearboost::tokenizer_detail::assign_or_plus_equal304 static void clear(Token &t) { 305 t = Token(); 306 } 307 }; 308 309 310 template<class Iterator> 311 struct pointer_iterator_category{ 312 typedef std::random_access_iterator_tag type; 313 }; 314 315 316 template<class Iterator> 317 struct class_iterator_category{ 318 typedef typename Iterator::iterator_category type; 319 }; 320 321 322 323 // This portably gets the iterator_tag without partial template specialization 324 template<class Iterator> 325 struct get_iterator_category{ 326 typedef typename mpl::if_<is_pointer<Iterator>, 
327 pointer_iterator_category<Iterator>, 328 class_iterator_category<Iterator> 329 >::type cat; 330 331 typedef typename cat::type iterator_category; 332 }; 333 334 335 } // namespace tokenizer_detail 336 337 338 //=========================================================================== 339 // The offset_separator class, which is a model of TokenizerFunction. 340 // Offset breaks a string into tokens based on a range of offsets 341 342 class offset_separator { 343 private: 344 345 std::vector<int> offsets_; 346 unsigned int current_offset_; 347 bool wrap_offsets_; 348 bool return_partial_last_; 349 350 public: 351 template <typename Iter> offset_separator(Iter begin,Iter end,bool wrap_offsets=true,bool return_partial_last=true)352 offset_separator(Iter begin, Iter end, bool wrap_offsets = true, 353 bool return_partial_last = true) 354 : offsets_(begin,end), current_offset_(0), 355 wrap_offsets_(wrap_offsets), 356 return_partial_last_(return_partial_last) { } 357 offset_separator()358 offset_separator() 359 : offsets_(1,1), current_offset_(), 360 wrap_offsets_(true), return_partial_last_(true) { } 361 reset()362 void reset() { 363 current_offset_ = 0; 364 } 365 366 template <typename InputIterator, typename Token> operator ()(InputIterator & next,InputIterator end,Token & tok)367 bool operator()(InputIterator& next, InputIterator end, Token& tok) 368 { 369 typedef tokenizer_detail::assign_or_plus_equal< 370 BOOST_DEDUCED_TYPENAME tokenizer_detail::get_iterator_category< 371 InputIterator 372 >::iterator_category 373 > assigner; 374 375 BOOST_ASSERT(!offsets_.empty()); 376 377 assigner::clear(tok); 378 InputIterator start(next); 379 380 if (next == end) 381 return false; 382 383 if (current_offset_ == offsets_.size()) 384 { 385 if (wrap_offsets_) 386 current_offset_=0; 387 else 388 return false; 389 } 390 391 int c = offsets_[current_offset_]; 392 int i = 0; 393 for (; i < c; ++i) { 394 if (next == end)break; 395 assigner::plus_equal(tok,*next++); 396 } 397 
assigner::assign(start,next,tok); 398 399 if (!return_partial_last_) 400 if (i < (c-1) ) 401 return false; 402 403 ++current_offset_; 404 return true; 405 } 406 }; 407 408 409 //=========================================================================== 410 // The char_separator class breaks a sequence of characters into 411 // tokens based on the character delimiters (very much like bad old 412 // strtok). A delimiter character can either be kept or dropped. A 413 // kept delimiter shows up as an output token, whereas a dropped 414 // delimiter does not. 415 416 // This class replaces the char_delimiters_separator class. The 417 // constructor for the char_delimiters_separator class was too 418 // confusing and needed to be deprecated. However, because of the 419 // default arguments to the constructor, adding the new constructor 420 // would cause ambiguity, so instead I deprecated the whole class. 421 // The implementation of the class was also simplified considerably. 422 423 enum empty_token_policy { drop_empty_tokens, keep_empty_tokens }; 424 425 // The out of the box GCC 2.95 on cygwin does not have a char_traits class. 426 template <typename Char, 427 typename Tr = BOOST_DEDUCED_TYPENAME std::basic_string<Char>::traits_type > 428 class char_separator 429 { 430 typedef tokenizer_detail::traits_extension<Tr> Traits; 431 typedef std::basic_string<Char,Tr> string_type; 432 public: 433 explicit char_separator(const Char * dropped_delims,const Char * kept_delims=0,empty_token_policy empty_tokens=drop_empty_tokens)434 char_separator(const Char* dropped_delims, 435 const Char* kept_delims = 0, 436 empty_token_policy empty_tokens = drop_empty_tokens) 437 : m_dropped_delims(dropped_delims), 438 m_use_ispunct(false), 439 m_use_isspace(false), 440 m_empty_tokens(empty_tokens), 441 m_output_done(false) 442 { 443 // Borland workaround 444 if (kept_delims) 445 m_kept_delims = kept_delims; 446 } 447 448 // use ispunct() for kept delimiters and isspace for dropped. 
449 explicit char_separator()450 char_separator() 451 : m_use_ispunct(true), 452 m_use_isspace(true), 453 m_empty_tokens(drop_empty_tokens) { } 454 reset()455 void reset() { } 456 457 template <typename InputIterator, typename Token> operator ()(InputIterator & next,InputIterator end,Token & tok)458 bool operator()(InputIterator& next, InputIterator end, Token& tok) 459 { 460 typedef tokenizer_detail::assign_or_plus_equal< 461 BOOST_DEDUCED_TYPENAME tokenizer_detail::get_iterator_category< 462 InputIterator 463 >::iterator_category 464 > assigner; 465 466 assigner::clear(tok); 467 468 // skip past all dropped_delims 469 if (m_empty_tokens == drop_empty_tokens) 470 for (; next != end && is_dropped(*next); ++next) 471 { } 472 473 InputIterator start(next); 474 475 if (m_empty_tokens == drop_empty_tokens) { 476 477 if (next == end) 478 return false; 479 480 481 // if we are on a kept_delims move past it and stop 482 if (is_kept(*next)) { 483 assigner::plus_equal(tok,*next); 484 ++next; 485 } else 486 // append all the non delim characters 487 for (; next != end && !is_dropped(*next) && !is_kept(*next); ++next) 488 assigner::plus_equal(tok,*next); 489 } 490 else { // m_empty_tokens == keep_empty_tokens 491 492 // Handle empty token at the end 493 if (next == end) 494 { 495 if (m_output_done == false) 496 { 497 m_output_done = true; 498 assigner::assign(start,next,tok); 499 return true; 500 } 501 else 502 return false; 503 } 504 505 if (is_kept(*next)) { 506 if (m_output_done == false) 507 m_output_done = true; 508 else { 509 assigner::plus_equal(tok,*next); 510 ++next; 511 m_output_done = false; 512 } 513 } 514 else if (m_output_done == false && is_dropped(*next)) { 515 m_output_done = true; 516 } 517 else { 518 if (is_dropped(*next)) 519 start=++next; 520 for (; next != end && !is_dropped(*next) && !is_kept(*next); ++next) 521 assigner::plus_equal(tok,*next); 522 m_output_done = true; 523 } 524 } 525 assigner::assign(start,next,tok); 526 return true; 527 } 528 529 
private: 530 string_type m_kept_delims; 531 string_type m_dropped_delims; 532 bool m_use_ispunct; 533 bool m_use_isspace; 534 empty_token_policy m_empty_tokens; 535 bool m_output_done; 536 is_kept(Char E) const537 bool is_kept(Char E) const 538 { 539 if (m_kept_delims.length()) 540 return m_kept_delims.find(E) != string_type::npos; 541 else if (m_use_ispunct) { 542 return Traits::ispunct(E) != 0; 543 } else 544 return false; 545 } is_dropped(Char E) const546 bool is_dropped(Char E) const 547 { 548 if (m_dropped_delims.length()) 549 return m_dropped_delims.find(E) != string_type::npos; 550 else if (m_use_isspace) { 551 return Traits::isspace(E) != 0; 552 } else 553 return false; 554 } 555 }; 556 557 //=========================================================================== 558 // The following class is DEPRECATED, use class char_separators instead. 559 // 560 // The char_delimiters_separator class, which is a model of 561 // TokenizerFunction. char_delimiters_separator breaks a string 562 // into tokens based on character delimiters. There are 2 types of 563 // delimiters. returnable delimiters can be returned as 564 // tokens. These are often punctuation. nonreturnable delimiters 565 // cannot be returned as tokens. These are often whitespace 566 567 // The out of the box GCC 2.95 on cygwin does not have a char_traits class. 
568 template <class Char, 569 class Tr = BOOST_DEDUCED_TYPENAME std::basic_string<Char>::traits_type > 570 class char_delimiters_separator { 571 private: 572 573 typedef tokenizer_detail::traits_extension<Tr> Traits; 574 typedef std::basic_string<Char,Tr> string_type; 575 string_type returnable_; 576 string_type nonreturnable_; 577 bool return_delims_; 578 bool no_ispunct_; 579 bool no_isspace_; 580 is_ret(Char E) const581 bool is_ret(Char E)const 582 { 583 if (returnable_.length()) 584 return returnable_.find(E) != string_type::npos; 585 else{ 586 if (no_ispunct_) {return false;} 587 else{ 588 int r = Traits::ispunct(E); 589 return r != 0; 590 } 591 } 592 } is_nonret(Char E) const593 bool is_nonret(Char E)const 594 { 595 if (nonreturnable_.length()) 596 return nonreturnable_.find(E) != string_type::npos; 597 else{ 598 if (no_isspace_) {return false;} 599 else{ 600 int r = Traits::isspace(E); 601 return r != 0; 602 } 603 } 604 } 605 606 public: char_delimiters_separator(bool return_delims=false,const Char * returnable=0,const Char * nonreturnable=0)607 explicit char_delimiters_separator(bool return_delims = false, 608 const Char* returnable = 0, 609 const Char* nonreturnable = 0) 610 : returnable_(returnable ? returnable : string_type().c_str()), 611 nonreturnable_(nonreturnable ? 
nonreturnable:string_type().c_str()), 612 return_delims_(return_delims), no_ispunct_(returnable!=0), 613 no_isspace_(nonreturnable!=0) { } 614 reset()615 void reset() { } 616 617 public: 618 619 template <typename InputIterator, typename Token> operator ()(InputIterator & next,InputIterator end,Token & tok)620 bool operator()(InputIterator& next, InputIterator end,Token& tok) { 621 tok = Token(); 622 623 // skip past all nonreturnable delims 624 // skip past the returnable only if we are not returning delims 625 for (;next!=end && ( is_nonret(*next) || (is_ret(*next) 626 && !return_delims_ ) );++next) { } 627 628 if (next == end) { 629 return false; 630 } 631 632 // if we are to return delims and we are one a returnable one 633 // move past it and stop 634 if (is_ret(*next) && return_delims_) { 635 tok+=*next; 636 ++next; 637 } 638 else 639 // append all the non delim characters 640 for (;next!=end && !is_nonret(*next) && !is_ret(*next);++next) 641 tok+=*next; 642 643 644 return true; 645 } 646 }; 647 648 649 } //namespace boost 650 651 #endif 652