1 // string_token.hpp 2 // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/) 3 // 4 // Distributed under the Boost Software License, Version 1.0. (See accompanying 5 // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 6 #ifndef BOOST_LEXER_STRING_TOKEN_HPP 7 #define BOOST_LEXER_STRING_TOKEN_HPP 8 9 #include <algorithm> 10 #include "size_t.hpp" 11 #include "consts.hpp" // num_chars, num_wchar_ts 12 #include <string> 13 #include <limits> 14 15 namespace boost 16 { 17 namespace lexer 18 { 19 template<typename CharT> 20 struct basic_string_token 21 { 22 typedef std::basic_string<CharT> string; 23 24 bool _negated; 25 string _charset; 26 basic_string_tokenboost::lexer::basic_string_token27 basic_string_token () : 28 _negated (false) 29 { 30 } 31 basic_string_tokenboost::lexer::basic_string_token32 basic_string_token (const bool negated_, const string &charset_) : 33 _negated (negated_), 34 _charset (charset_) 35 { 36 } 37 remove_duplicatesboost::lexer::basic_string_token38 void remove_duplicates () 39 { 40 const CharT *start_ = _charset.c_str (); 41 const CharT *end_ = start_ + _charset.size (); 42 43 // Optimisation for very large charsets: 44 // sorting via pointers is much quicker than 45 // via iterators... 46 std::sort (const_cast<CharT *> (start_), const_cast<CharT *> (end_)); 47 _charset.erase (std::unique (_charset.begin (), _charset.end ()), 48 _charset.end ()); 49 } 50 normaliseboost::lexer::basic_string_token51 void normalise () 52 { 53 const std::size_t max_chars_ = sizeof (CharT) == 1 ? 54 num_chars : num_wchar_ts; 55 56 if (_charset.length () == max_chars_) 57 { 58 _negated = !_negated; 59 _charset.clear (); 60 } 61 else if (_charset.length () > max_chars_ / 2) 62 { 63 negate (); 64 } 65 } 66 negateboost::lexer::basic_string_token67 void negate () 68 { 69 const std::size_t max_chars_ = sizeof (CharT) == 1 ? 70 num_chars : num_wchar_ts; 71 CharT curr_char_ = (std::numeric_limits<CharT>::min)(); 72 string temp_; 73 const CharT *curr_ = _charset.c_str (); 74 const CharT *chars_end_ = curr_ + _charset.size (); 75 76 _negated = !_negated; 77 temp_.resize (max_chars_ - _charset.size ()); 78 79 CharT *ptr_ = const_cast<CharT *> (temp_.c_str ()); 80 std::size_t i_ = 0; 81 82 while (curr_ < chars_end_) 83 { 84 while (*curr_ > curr_char_) 85 { 86 *ptr_ = curr_char_; 87 ++ptr_; 88 ++curr_char_; 89 ++i_; 90 } 91 92 ++curr_char_; 93 ++curr_; 94 ++i_; 95 } 96 97 for (; i_ < max_chars_; ++i_) 98 { 99 *ptr_ = curr_char_; 100 ++ptr_; 101 ++curr_char_; 102 } 103 104 _charset = temp_; 105 } 106 operator <boost::lexer::basic_string_token107 bool operator < (const basic_string_token &rhs_) const 108 { 109 return _negated < rhs_._negated || 110 (_negated == rhs_._negated && _charset < rhs_._charset); 111 } 112 emptyboost::lexer::basic_string_token113 bool empty () const 114 { 115 return _charset.empty () && !_negated; 116 } 117 anyboost::lexer::basic_string_token118 bool any () const 119 { 120 return _charset.empty () && _negated; 121 } 122 clearboost::lexer::basic_string_token123 void clear () 124 { 125 _negated = false; 126 _charset.clear (); 127 } 128 intersectboost::lexer::basic_string_token129 void intersect (basic_string_token &rhs_, basic_string_token &overlap_) 130 { 131 if ((any () && rhs_.any ()) || (_negated == rhs_._negated && 132 !any () && !rhs_.any ())) 133 { 134 intersect_same_types (rhs_, overlap_); 135 } 136 else 137 { 138 intersect_diff_types (rhs_, overlap_); 139 } 140 } 141 escape_charboost::lexer::basic_string_token142 static void escape_char (const CharT ch_, string &out_) 143 { 144 switch (ch_) 145 { 146 case '\0': 147 out_ += '\\'; 148 out_ += '0'; 149 break; 150 case '\a': 151 out_ += '\\'; 152 out_ += 'a'; 153 break; 154 case '\b': 155 out_ += '\\'; 156 out_ += 'b'; 157 break; 158 case 27: 159 out_ += '\\'; 160 out_ += 'x'; 161 out_ += '1'; 162 out_ += 'b'; 163 break; 164 case '\f': 165 out_ += '\\'; 166 out_ += 'f'; 167 break; 168 case '\n': 169 out_ += '\\'; 170 out_ += 'n'; 171 break; 172 case '\r': 173 out_ += '\\'; 174 out_ += 'r'; 175 break; 176 case '\t': 177 out_ += '\\'; 178 out_ += 't'; 179 break; 180 case '\v': 181 out_ += '\\'; 182 out_ += 'v'; 183 break; 184 case '\\': 185 out_ += '\\'; 186 out_ += '\\'; 187 break; 188 case '"': 189 out_ += '\\'; 190 out_ += '"'; 191 break; 192 case '\'': 193 out_ += '\\'; 194 out_ += '\''; 195 break; 196 default: 197 { 198 if (ch_ < 32 && ch_ >= 0) 199 { 200 std::basic_stringstream<CharT> ss_; 201 202 out_ += '\\'; 203 out_ += 'x'; 204 ss_ << std::hex << 205 static_cast<std::size_t> (ch_); 206 out_ += ss_.str (); 207 } 208 else 209 { 210 out_ += ch_; 211 } 212 213 break; 214 } 215 } 216 } 217 218 private: intersect_same_typesboost::lexer::basic_string_token219 void intersect_same_types (basic_string_token &rhs_, 220 basic_string_token &overlap_) 221 { 222 if (any ()) 223 { 224 clear (); 225 overlap_._negated = true; 226 rhs_.clear (); 227 } 228 else 229 { 230 typename string::iterator iter_ = _charset.begin (); 231 typename string::iterator end_ = _charset.end (); 232 typename string::iterator rhs_iter_ = rhs_._charset.begin (); 233 typename string::iterator rhs_end_ = rhs_._charset.end (); 234 235 overlap_._negated = _negated; 236 237 while (iter_ != end_ && rhs_iter_ != rhs_end_) 238 { 239 if (*iter_ < *rhs_iter_) 240 { 241 ++iter_; 242 } 243 else if (*iter_ > *rhs_iter_) 244 { 245 ++rhs_iter_; 246 } 247 else 248 { 249 overlap_._charset += *iter_; 250 iter_ = _charset.erase (iter_); 251 end_ = _charset.end (); 252 rhs_iter_ = rhs_._charset.erase (rhs_iter_); 253 rhs_end_ = rhs_._charset.end (); 254 } 255 } 256 257 if (_negated) 258 { 259 // duplicates already merged, so safe to merge 260 // using std lib. 261 262 // src, dest 263 merge (_charset, overlap_._charset); 264 // duplicates already merged, so safe to merge 265 // using std lib. 266 267 // src, dest 268 merge (rhs_._charset, overlap_._charset); 269 _negated = false; 270 rhs_._negated = false; 271 std::swap (_charset, rhs_._charset); 272 normalise (); 273 overlap_.normalise (); 274 rhs_.normalise (); 275 } 276 else if (!overlap_._charset.empty ()) 277 { 278 normalise (); 279 overlap_.normalise (); 280 rhs_.normalise (); 281 } 282 } 283 } 284 intersect_diff_typesboost::lexer::basic_string_token285 void intersect_diff_types (basic_string_token &rhs_, 286 basic_string_token &overlap_) 287 { 288 if (any ()) 289 { 290 intersect_any (rhs_, overlap_); 291 } 292 else if (_negated) 293 { 294 intersect_negated (rhs_, overlap_); 295 } 296 else // _negated == false 297 { 298 intersect_charset (rhs_, overlap_); 299 } 300 } 301 intersect_anyboost::lexer::basic_string_token302 void intersect_any (basic_string_token &rhs_, basic_string_token &overlap_) 303 { 304 if (rhs_._negated) 305 { 306 rhs_.intersect_negated (*this, overlap_); 307 } 308 else // rhs._negated == false 309 { 310 rhs_.intersect_charset (*this, overlap_); 311 } 312 } 313 intersect_negatedboost::lexer::basic_string_token314 void intersect_negated (basic_string_token &rhs_, 315 basic_string_token &overlap_) 316 { 317 if (rhs_.any ()) 318 { 319 overlap_._negated = true; 320 overlap_._charset = _charset; 321 rhs_._negated = false; 322 rhs_._charset = _charset; 323 clear (); 324 } 325 else // rhs._negated == false 326 { 327 rhs_.intersect_charset (*this, overlap_); 328 } 329 } 330 intersect_charsetboost::lexer::basic_string_token331 void intersect_charset (basic_string_token &rhs_, 332 basic_string_token &overlap_) 333 { 334 if (rhs_.any ()) 335 { 336 overlap_._charset = _charset; 337 rhs_._negated = true; 338 rhs_._charset = _charset; 339 clear (); 340 } 341 else // rhs_._negated == true 342 { 343 typename string::iterator iter_ = _charset.begin (); 344 typename string::iterator end_ = _charset.end (); 345 typename string::iterator rhs_iter_ = rhs_._charset.begin (); 346 typename string::iterator rhs_end_ = rhs_._charset.end (); 347 348 while (iter_ != end_ && rhs_iter_ != rhs_end_) 349 { 350 if (*iter_ < *rhs_iter_) 351 { 352 overlap_._charset += *iter_; 353 rhs_iter_ = rhs_._charset.insert (rhs_iter_, *iter_); 354 ++rhs_iter_; 355 rhs_end_ = rhs_._charset.end (); 356 iter_ = _charset.erase (iter_); 357 end_ = _charset.end (); 358 } 359 else if (*iter_ > *rhs_iter_) 360 { 361 ++rhs_iter_; 362 } 363 else 364 { 365 ++iter_; 366 ++rhs_iter_; 367 } 368 } 369 370 if (iter_ != end_) 371 { 372 // nothing bigger in rhs_ than iter_, 373 // so safe to merge using std lib. 374 string temp_ (iter_, end_); 375 376 // src, dest 377 merge (temp_, overlap_._charset); 378 _charset.erase (iter_, end_); 379 } 380 381 if (!overlap_._charset.empty ()) 382 { 383 merge (overlap_._charset, rhs_._charset); 384 // possible duplicates, so check for any and erase. 385 rhs_._charset.erase (std::unique (rhs_._charset.begin (), 386 rhs_._charset.end ()), rhs_._charset.end ()); 387 normalise (); 388 overlap_.normalise (); 389 rhs_.normalise (); 390 } 391 } 392 } 393 mergeboost::lexer::basic_string_token394 void merge (string &src_, string &dest_) 395 { 396 string tmp_ (src_.size () + dest_.size (), 0); 397 398 std::merge (src_.begin (), src_.end (), dest_.begin (), dest_.end (), 399 tmp_.begin ()); 400 dest_ = tmp_; 401 } 402 }; 403 } 404 } 405 406 #endif 407