1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef BASE_STRINGS_STRING_TOKENIZER_H_ 6 #define BASE_STRINGS_STRING_TOKENIZER_H_ 7 8 #include <algorithm> 9 #include <string> 10 11 #include "base/strings/string_piece.h" 12 13 namespace base { 14 15 // StringTokenizerT is a simple string tokenizer class. It works like an 16 // iterator that with each step (see the Advance method) updates members that 17 // refer to the next token in the input string. The user may optionally 18 // configure the tokenizer to return delimiters. 19 // 20 // EXAMPLE 1: 21 // 22 // char input[] = "this is a test"; 23 // CStringTokenizer t(input, input + strlen(input), " "); 24 // while (t.GetNext()) { 25 // printf("%s\n", t.token().c_str()); 26 // } 27 // 28 // Output: 29 // 30 // this 31 // is 32 // a 33 // test 34 // 35 // 36 // EXAMPLE 2: 37 // 38 // std::string input = "no-cache=\"foo, bar\", private"; 39 // StringTokenizer t(input, ", "); 40 // t.set_quote_chars("\""); 41 // while (t.GetNext()) { 42 // printf("%s\n", t.token().c_str()); 43 // } 44 // 45 // Output: 46 // 47 // no-cache="foo, bar" 48 // private 49 // 50 // 51 // EXAMPLE 3: 52 // 53 // bool next_is_option = false, next_is_value = false; 54 // std::string input = "text/html; charset=UTF-8; foo=bar"; 55 // StringTokenizer t(input, "; ="); 56 // t.set_options(StringTokenizer::RETURN_DELIMS); 57 // while (t.GetNext()) { 58 // if (t.token_is_delim()) { 59 // switch (*t.token_begin()) { 60 // case ';': 61 // next_is_option = true; 62 // break; 63 // case '=': 64 // next_is_value = true; 65 // break; 66 // } 67 // } else { 68 // const char* label; 69 // if (next_is_option) { 70 // label = "option-name"; 71 // next_is_option = false; 72 // } else if (next_is_value) { 73 // label = "option-value"; 74 // next_is_value = false; 75 // } else { 76 // label = "mime-type"; 77 // } 78 // printf("%s: %s\n", label, t.token().c_str()); 79 // } 80 // } 81 // 82 // 83 template <class str, class const_iterator> 84 class StringTokenizerT { 85 public: 86 typedef typename str::value_type char_type; 87 88 // Options that may be pass to set_options() 89 enum { 90 // Specifies the delimiters should be returned as tokens 91 RETURN_DELIMS = 1 << 0, 92 93 // Specifies that empty tokens should be returned. Treats the beginning and 94 // ending of the string as implicit delimiters, though doesn't return them 95 // as tokens if RETURN_DELIMS is also used. 96 RETURN_EMPTY_TOKENS = 1 << 1, 97 }; 98 99 // The string object must live longer than the tokenizer. In particular, this 100 // should not be constructed with a temporary. The deleted rvalue constructor 101 // blocks the most obvious instances of this (e.g. passing a string literal to 102 // the constructor), but caution must still be exercised. StringTokenizerT(const str & string,const str & delims)103 StringTokenizerT(const str& string, 104 const str& delims) { 105 Init(string.begin(), string.end(), delims); 106 } 107 108 // Don't allow temporary strings to be used with string tokenizer, since 109 // Init() would otherwise save iterators to a temporary string. 110 StringTokenizerT(str&&, const str& delims) = delete; 111 StringTokenizerT(const_iterator string_begin,const_iterator string_end,const str & delims)112 StringTokenizerT(const_iterator string_begin, 113 const_iterator string_end, 114 const str& delims) { 115 Init(string_begin, string_end, delims); 116 } 117 118 // Set the options for this tokenizer. By default, this is 0. set_options(int options)119 void set_options(int options) { options_ = options; } 120 121 // Set the characters to regard as quotes. By default, this is empty. When 122 // a quote char is encountered, the tokenizer will switch into a mode where 123 // it ignores delimiters that it finds. It switches out of this mode once it 124 // finds another instance of the quote char. If a backslash is encountered 125 // within a quoted string, then the next character is skipped. set_quote_chars(const str & quotes)126 void set_quote_chars(const str& quotes) { quotes_ = quotes; } 127 128 // Call this method to advance the tokenizer to the next delimiter. This 129 // returns false if the tokenizer is complete. This method must be called 130 // before calling any of the token* methods. GetNext()131 bool GetNext() { 132 if (quotes_.empty() && options_ == 0) 133 return QuickGetNext(); 134 else 135 return FullGetNext(); 136 } 137 138 // Start iterating through tokens from the beginning of the string. Reset()139 void Reset() { 140 token_end_ = start_pos_; 141 } 142 143 // Returns true if token is a delimiter. When the tokenizer is constructed 144 // with the RETURN_DELIMS option, this method can be used to check if the 145 // returned token is actually a delimiter. Returns true before the first 146 // time GetNext() has been called, and after GetNext() returns false. token_is_delim()147 bool token_is_delim() const { return token_is_delim_; } 148 149 // If GetNext() returned true, then these methods may be used to read the 150 // value of the token. token_begin()151 const_iterator token_begin() const { return token_begin_; } token_end()152 const_iterator token_end() const { return token_end_; } token()153 str token() const { return str(token_begin_, token_end_); } token_piece()154 BasicStringPiece<str> token_piece() const { 155 return BasicStringPiece<str>(&*token_begin_, 156 std::distance(token_begin_, token_end_)); 157 } 158 159 private: Init(const_iterator string_begin,const_iterator string_end,const str & delims)160 void Init(const_iterator string_begin, 161 const_iterator string_end, 162 const str& delims) { 163 start_pos_ = string_begin; 164 token_begin_ = string_begin; 165 token_end_ = string_begin; 166 end_ = string_end; 167 delims_ = delims; 168 options_ = 0; 169 token_is_delim_ = true; 170 } 171 172 // Implementation of GetNext() for when we have no quote characters. We have 173 // two separate implementations because AdvanceOne() is a hot spot in large 174 // text files with large tokens. QuickGetNext()175 bool QuickGetNext() { 176 token_is_delim_ = false; 177 for (;;) { 178 token_begin_ = token_end_; 179 if (token_end_ == end_) { 180 token_is_delim_ = true; 181 return false; 182 } 183 ++token_end_; 184 if (delims_.find(*token_begin_) == str::npos) 185 break; 186 // else skip over delimiter. 187 } 188 while (token_end_ != end_ && delims_.find(*token_end_) == str::npos) 189 ++token_end_; 190 return true; 191 } 192 193 // Implementation of GetNext() for when we have to take quotes into account. FullGetNext()194 bool FullGetNext() { 195 AdvanceState state; 196 197 for (;;) { 198 if (token_is_delim_) { 199 // Last token was a delimiter. Note: This is also the case at the start. 200 // 201 // ... D T T T T D ... 202 // ^ ^ 203 // | | 204 // | |token_end_| : The next character to look at or |end_|. 205 // | 206 // |token_begin_| : Points to delimiter or |token_end_|. 207 // 208 // The next token is always a non-delimiting token. It could be empty, 209 // however. 210 token_is_delim_ = false; 211 token_begin_ = token_end_; 212 213 // Slurp all non-delimiter characters into the token. 214 while (token_end_ != end_ && AdvanceOne(&state, *token_end_)) { 215 ++token_end_; 216 } 217 218 // If it's non-empty, or empty tokens were requested, return the token. 219 if (token_begin_ != token_end_ || (options_ & RETURN_EMPTY_TOKENS)) 220 return true; 221 } 222 223 DCHECK(!token_is_delim_); 224 // Last token was a regular token. 225 // 226 // ... T T T D T T ... 227 // ^ ^ 228 // | | 229 // | token_end_ : The next character to look at. Always one 230 // | char beyond the token boundary. 231 // | 232 // token_begin_ : Points to beginning of token. Note: token could 233 // be empty, in which case 234 // token_begin_ == token_end_. 235 // 236 // The next token is always a delimiter. It could be |end_| however, but 237 // |end_| is also an implicit delimiter. 238 token_is_delim_ = true; 239 token_begin_ = token_end_; 240 241 if (token_end_ == end_) 242 return false; 243 244 // Look at the delimiter. 245 ++token_end_; 246 if (options_ & RETURN_DELIMS) 247 return true; 248 } 249 250 return false; 251 } 252 IsDelim(char_type c)253 bool IsDelim(char_type c) const { 254 return delims_.find(c) != str::npos; 255 } 256 IsQuote(char_type c)257 bool IsQuote(char_type c) const { 258 return quotes_.find(c) != str::npos; 259 } 260 261 struct AdvanceState { 262 bool in_quote; 263 bool in_escape; 264 char_type quote_char; AdvanceStateAdvanceState265 AdvanceState() : in_quote(false), in_escape(false), quote_char('\0') {} 266 }; 267 268 // Returns true if a delimiter was not hit. AdvanceOne(AdvanceState * state,char_type c)269 bool AdvanceOne(AdvanceState* state, char_type c) { 270 if (state->in_quote) { 271 if (state->in_escape) { 272 state->in_escape = false; 273 } else if (c == '\\') { 274 state->in_escape = true; 275 } else if (c == state->quote_char) { 276 state->in_quote = false; 277 } 278 } else { 279 if (IsDelim(c)) 280 return false; 281 state->in_quote = IsQuote(state->quote_char = c); 282 } 283 return true; 284 } 285 286 const_iterator start_pos_; 287 const_iterator token_begin_; 288 const_iterator token_end_; 289 const_iterator end_; 290 str delims_; 291 str quotes_; 292 int options_; 293 bool token_is_delim_; 294 }; 295 296 typedef StringTokenizerT<std::string, std::string::const_iterator> 297 StringTokenizer; 298 typedef StringTokenizerT<string16, string16::const_iterator> String16Tokenizer; 299 typedef StringTokenizerT<std::string, const char*> CStringTokenizer; 300 301 } // namespace base 302 303 #endif // BASE_STRINGS_STRING_TOKENIZER_H_ 304