1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */ 3 /* This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 #ifndef Tokenizer_h__ 8 #define Tokenizer_h__ 9 10 #include "nsString.h" 11 #include "mozilla/CheckedInt.h" 12 #include "mozilla/ScopeExit.h" 13 #include "mozilla/TypeTraits.h" 14 #include "mozilla/UniquePtr.h" 15 #include "nsTArray.h" 16 17 namespace mozilla { 18 19 class TokenizerBase { 20 public: 21 /** 22 * The analyzer works with elements in the input cut to a sequence of token 23 * where each token has an elementary type 24 */ 25 enum TokenType : uint32_t { 26 TOKEN_UNKNOWN, 27 TOKEN_RAW, 28 TOKEN_ERROR, 29 TOKEN_INTEGER, 30 TOKEN_WORD, 31 TOKEN_CHAR, 32 TOKEN_WS, 33 TOKEN_EOL, 34 TOKEN_EOF, 35 TOKEN_CUSTOM0 = 1000 36 }; 37 38 enum ECaseSensitivity { CASE_SENSITIVE, CASE_INSENSITIVE }; 39 40 /** 41 * Class holding the type and the value of a token. It can be manually 42 * created to allow checks against it via methods of Tokenizer or are results 43 * of some of the Tokenizer's methods. 44 */ 45 class Token { 46 TokenType mType; 47 nsDependentCSubstring mWord; 48 nsCString mCustom; 49 char mChar; 50 uint64_t mInteger; 51 ECaseSensitivity mCustomCaseInsensitivity; 52 bool mCustomEnabled; 53 54 // If this token is a result of the parsing process, this member is 55 // referencing a sub-string in the input buffer. If this is externally 56 // created Token this member is left an empty string. 57 nsDependentCSubstring mFragment; 58 59 friend class TokenizerBase; 60 void AssignFragment(nsACString::const_char_iterator begin, 61 nsACString::const_char_iterator end); 62 63 static Token Raw(); 64 65 public: 66 Token(); 67 Token(const Token& aOther); 68 Token& operator=(const Token& aOther); 69 70 // Static constructors of tokens by type and value 71 static Token Word(const nsACString& aWord); 72 static Token Char(const char aChar); 73 static Token Number(const uint64_t aNumber); 74 static Token Whitespace(); 75 static Token NewLine(); 76 static Token EndOfFile(); 77 static Token Error(); 78 79 // Compares the two tokens, type must be identical and value 80 // of one of the tokens must be 'any' or equal. 81 bool Equals(const Token& aOther) const; 82 Type()83 TokenType Type() const { return mType; } 84 char AsChar() const; 85 nsDependentCSubstring AsString() const; 86 uint64_t AsInteger() const; 87 Fragment()88 nsDependentCSubstring Fragment() const { return mFragment; } 89 }; 90 91 /** 92 * Consumers may register a custom string that, when found in the input, is 93 * considered a token and returned by Next*() and accepted by Check*() 94 * methods. AddCustomToken() returns a reference to a token that can then be 95 * comapred using Token::Equals() againts the output from Next*() or be passed 96 * to Check*(). 97 */ 98 Token AddCustomToken(const nsACString& aValue, 99 ECaseSensitivity aCaseInsensitivity, 100 bool aEnabled = true); 101 template <uint32_t N> 102 Token AddCustomToken(const char (&aValue)[N], 103 ECaseSensitivity aCaseInsensitivity, 104 bool aEnabled = true) { 105 return AddCustomToken(nsDependentCSubstring(aValue, N - 1), 106 aCaseInsensitivity, aEnabled); 107 } 108 void RemoveCustomToken(Token& aToken); 109 /** 110 * Only applies to a custom type of a Token (see AddCustomToken above.) 111 * This turns on and off token recognition. When a custom token is disabled, 112 * it's ignored as never added as a custom token. 113 */ 114 void EnableCustomToken(Token const& aToken, bool aEnable); 115 116 /** 117 * Mode of tokenization. 118 * FULL tokenization, the default, recognizes built-in tokens and any custom 119 * tokens, if added. CUSTOM_ONLY will only recognize custom tokens, the rest 120 * is seen as 'raw'. This mode can be understood as a 'binary' mode. 121 */ 122 enum class Mode { FULL, CUSTOM_ONLY }; 123 void SetTokenizingMode(Mode aMode); 124 125 /** 126 * Return false iff the last Check*() call has returned false or when we've 127 * read past the end of the input string. 128 */ 129 MOZ_MUST_USE bool HasFailed() const; 130 131 protected: 132 explicit TokenizerBase(const char* aWhitespaces = nullptr, 133 const char* aAdditionalWordChars = nullptr); 134 135 // false if we have already read the EOF token. 136 bool HasInput() const; 137 // Main parsing function, it doesn't shift the read cursor, just returns the 138 // next token position. 139 nsACString::const_char_iterator Parse(Token& aToken) const; 140 // Is read cursor at the end? 141 bool IsEnd(const nsACString::const_char_iterator& caret) const; 142 // True, when we are at the end of the input data, but it has not been marked 143 // as complete yet. In that case we cannot proceed with providing a 144 // multi-char token. 145 bool IsPending(const nsACString::const_char_iterator& caret) const; 146 // Is read cursor on a character that is a word start? 147 bool IsWordFirst(const char aInput) const; 148 // Is read cursor on a character that is an in-word letter? 149 bool IsWord(const char aInput) const; 150 // Is read cursor on a character that is a valid number? 151 // TODO - support multiple radix 152 bool IsNumber(const char aInput) const; 153 // Is equal to the given custom token? 154 bool IsCustom(const nsACString::const_char_iterator& caret, 155 const Token& aCustomToken, uint32_t* aLongest = nullptr) const; 156 157 // Friendly helper to assign a fragment on a Token 158 static void AssignFragment(Token& aToken, 159 nsACString::const_char_iterator begin, 160 nsACString::const_char_iterator end); 161 162 // true iff we have already read the EOF token 163 bool mPastEof; 164 // true iff the last Check*() call has returned false, reverts to true on 165 // Rollback() call 166 bool mHasFailed; 167 // true if the input string is final (finished), false when we expect more 168 // data yet to be fed to the tokenizer (see IncrementalTokenizer derived 169 // class). 170 bool mInputFinished; 171 // custom only vs full tokenizing mode, see the Parse() method 172 Mode mMode; 173 // minimal raw data chunked delivery during incremental feed 174 uint32_t mMinRawDelivery; 175 176 // Customizable list of whitespaces 177 const char* mWhitespaces; 178 // Additinal custom word characters 179 const char* mAdditionalWordChars; 180 181 // All these point to the original buffer passed to the constructor or to the 182 // incremental buffer after FeedInput. 183 nsACString::const_char_iterator 184 mCursor; // Position of the current (actually next to read) token start 185 nsACString::const_char_iterator mEnd; // End of the input position 186 187 // This is the list of tokens user has registered with AddCustomToken() 188 nsTArray<UniquePtr<Token>> mCustomTokens; 189 uint32_t mNextCustomTokenID; 190 191 private: 192 TokenizerBase() = delete; 193 TokenizerBase(const TokenizerBase&) = delete; 194 TokenizerBase(TokenizerBase&&) = delete; 195 TokenizerBase(const TokenizerBase&&) = delete; 196 TokenizerBase& operator=(const TokenizerBase&) = delete; 197 }; 198 199 /** 200 * This is a simple implementation of a lexical analyzer or maybe better 201 * called a tokenizer. It doesn't allow any user dictionaries or 202 * user define token types. 203 * 204 * It is limited only to ASCII input for now. UTF-8 or any other input 205 * encoding must yet be implemented. 206 */ 207 class Tokenizer : public TokenizerBase { 208 public: 209 /** 210 * @param aSource 211 * The string to parse. 212 * IMPORTANT NOTE: Tokenizer doesn't ensure the input string buffer 213 * lifetime. It's up to the consumer to make sure the string's buffer outlives 214 * the Tokenizer! 215 * @param aWhitespaces 216 * If non-null Tokenizer will use this custom set of whitespaces for 217 * CheckWhite() and SkipWhites() calls. By default the list consists of space 218 * and tab. 219 * @param aAdditionalWordChars 220 * If non-null it will be added to the list of characters that consist a 221 * word. This is useful when you want to accept e.g. '-' in HTTP headers. By 222 * default a word character is consider any character for which upper case 223 * is different from lower case. 224 * 225 * If there is an overlap between aWhitespaces and aAdditionalWordChars, the 226 * check for word characters is made first. 227 */ 228 explicit Tokenizer(const nsACString& aSource, 229 const char* aWhitespaces = nullptr, 230 const char* aAdditionalWordChars = nullptr); 231 explicit Tokenizer(const char* aSource, const char* aWhitespaces = nullptr, 232 const char* aAdditionalWordChars = nullptr); 233 234 /** 235 * When there is still anything to read from the input, tokenize it, store the 236 * token type and value to aToken result and shift the cursor past this just 237 * parsed token. Each call to Next() reads another token from the input and 238 * shifts the cursor. Returns false if we have passed the end of the input. 239 */ 240 MOZ_MUST_USE 241 bool Next(Token& aToken); 242 243 /** 244 * Parse the token on the input read cursor position, check its type is equal 245 * to aTokenType and if so, put it into aResult, shift the cursor and return 246 * true. Otherwise, leave the input read cursor position intact and return 247 * false. 248 */ 249 MOZ_MUST_USE 250 bool Check(const TokenType aTokenType, Token& aResult); 251 /** 252 * Same as above method, just compares both token type and token value passed 253 * in aToken. When both the type and the value equals, shift the cursor and 254 * return true. Otherwise return false. 255 */ 256 MOZ_MUST_USE 257 bool Check(const Token& aToken); 258 259 /** 260 * SkipWhites method (below) may also skip new line characters automatically. 261 */ 262 enum WhiteSkipping { 263 /** 264 * SkipWhites will only skip what is defined as a white space (default). 265 */ 266 DONT_INCLUDE_NEW_LINE = 0, 267 /** 268 * SkipWhites will skip definited white spaces as well as new lines 269 * automatically. 270 */ 271 INCLUDE_NEW_LINE = 1 272 }; 273 274 /** 275 * Skips any occurence of whitespaces specified in mWhitespaces member, 276 * optionally skip also new lines. 277 */ 278 void SkipWhites(WhiteSkipping aIncludeNewLines = DONT_INCLUDE_NEW_LINE); 279 280 /** 281 * Skips all tokens until the given one is found or EOF is hit. The token 282 * or EOF are next to read. 283 */ 284 void SkipUntil(Token const& aToken); 285 286 // These are mostly shortcuts for the Check() methods above. 287 288 /** 289 * Check whitespace character is present. 290 */ 291 MOZ_MUST_USE CheckWhite()292 bool CheckWhite() { return Check(Token::Whitespace()); } 293 /** 294 * Check there is a single character on the read cursor position. If so, 295 * shift the read cursor position and return true. Otherwise false. 296 */ 297 MOZ_MUST_USE CheckChar(const char aChar)298 bool CheckChar(const char aChar) { return Check(Token::Char(aChar)); } 299 /** 300 * This is a customizable version of CheckChar. aClassifier is a function 301 * called with value of the character on the current input read position. If 302 * this user function returns true, read cursor is shifted and true returned. 303 * Otherwise false. The user classifiction function is not called when we are 304 * at or past the end and false is immediately returned. 305 */ 306 MOZ_MUST_USE 307 bool CheckChar(bool (*aClassifier)(const char aChar)); 308 /** 309 * Check for a whole expected word. 310 */ 311 MOZ_MUST_USE CheckWord(const nsACString & aWord)312 bool CheckWord(const nsACString& aWord) { return Check(Token::Word(aWord)); } 313 /** 314 * Shortcut for literal const word check with compile time length calculation. 315 */ 316 template <uint32_t N> CheckWord(const char (& aWord)[N])317 MOZ_MUST_USE bool CheckWord(const char (&aWord)[N]) { 318 return Check(Token::Word(nsDependentCString(aWord, N - 1))); 319 } 320 /** 321 * Checks \r, \n or \r\n. 322 */ 323 MOZ_MUST_USE CheckEOL()324 bool CheckEOL() { return Check(Token::NewLine()); } 325 /** 326 * Checks we are at the end of the input string reading. If so, shift past 327 * the end and returns true. Otherwise does nothing and returns false. 328 */ 329 MOZ_MUST_USE CheckEOF()330 bool CheckEOF() { return Check(Token::EndOfFile()); } 331 332 /** 333 * These are shortcuts to obtain the value immediately when the token type 334 * matches. 335 */ 336 MOZ_MUST_USE bool ReadChar(char* aValue); 337 MOZ_MUST_USE bool ReadChar(bool (*aClassifier)(const char aChar), 338 char* aValue); 339 MOZ_MUST_USE bool ReadWord(nsACString& aValue); 340 MOZ_MUST_USE bool ReadWord(nsDependentCSubstring& aValue); 341 342 /** 343 * This is an integer read helper. It returns false and doesn't move the read 344 * cursor when any of the following happens: 345 * - the token at the read cursor is not an integer 346 * - the final number doesn't fit the T type 347 * Otherwise true is returned, aValue is filled with the integral number 348 * and the cursor is moved forward. 349 */ 350 template <typename T> ReadInteger(T * aValue)351 MOZ_MUST_USE bool ReadInteger(T* aValue) { 352 MOZ_RELEASE_ASSERT(aValue); 353 354 nsACString::const_char_iterator rollback = mRollback; 355 nsACString::const_char_iterator cursor = mCursor; 356 Token t; 357 if (!Check(TOKEN_INTEGER, t)) { 358 return false; 359 } 360 361 mozilla::CheckedInt<T> checked(t.AsInteger()); 362 if (!checked.isValid()) { 363 // Move to a state as if Check() call has failed 364 mRollback = rollback; 365 mCursor = cursor; 366 mHasFailed = true; 367 return false; 368 } 369 370 *aValue = checked.value(); 371 return true; 372 } 373 374 /** 375 * Same as above, but accepts an integer with an optional minus sign. 376 */ 377 template <typename T, typename V = typename EnableIf< 378 IsSigned<typename RemovePointer<T>::Type>::value, 379 typename RemovePointer<T>::Type>::Type> ReadSignedInteger(T * aValue)380 MOZ_MUST_USE bool ReadSignedInteger(T* aValue) { 381 MOZ_RELEASE_ASSERT(aValue); 382 383 nsACString::const_char_iterator rollback = mRollback; 384 nsACString::const_char_iterator cursor = mCursor; 385 auto revert = MakeScopeExit([&] { 386 // Move to a state as if Check() call has failed 387 mRollback = rollback; 388 mCursor = cursor; 389 mHasFailed = true; 390 }); 391 392 // Using functional raw access because '-' could be part of the word set 393 // making CheckChar('-') not work. 394 bool minus = CheckChar([](const char aChar) { return aChar == '-'; }); 395 396 Token t; 397 if (!Check(TOKEN_INTEGER, t)) { 398 return false; 399 } 400 401 mozilla::CheckedInt<T> checked(t.AsInteger()); 402 if (minus) { 403 checked *= -1; 404 } 405 406 if (!checked.isValid()) { 407 return false; 408 } 409 410 *aValue = checked.value(); 411 revert.release(); 412 return true; 413 } 414 415 /** 416 * Returns the read cursor position back as it was before the last call of any 417 * parsing method of Tokenizer (Next, Check*, Skip*, Read*) so that the last 418 * operation can be repeated. Rollback cannot be used multiple times, it only 419 * reverts the last successfull parse operation. It also cannot be used 420 * before any parsing operation has been called on the Tokenizer. 421 */ 422 void Rollback(); 423 424 /** 425 * Record() and Claim() are collecting the input as it is being parsed to 426 * obtain a substring between particular syntax bounderies defined by any 427 * recursive descent parser or simple parser the Tokenizer is used to read the 428 * input for. Inlucsion of a token that has just been parsed can be controlled 429 * using an arguemnt. 430 */ 431 enum ClaimInclusion { 432 /** 433 * Include resulting (or passed) token of the last lexical analyzer 434 * operation in the result. 435 */ 436 INCLUDE_LAST, 437 /** 438 * Do not include it. 439 */ 440 EXCLUDE_LAST 441 }; 442 443 /** 444 * Start the process of recording. Based on aInclude value the begining of 445 * the recorded sub-string is at the current position (EXCLUDE_LAST) or at the 446 * position before the last parsed token (INCLUDE_LAST). 447 */ 448 void Record(ClaimInclusion aInclude = EXCLUDE_LAST); 449 /** 450 * Claim result of the record started with Record() call before. Depending on 451 * aInclude the ending of the sub-string result includes or excludes the last 452 * parsed or checked token. 453 */ 454 void Claim(nsACString& aResult, ClaimInclusion aInclude = EXCLUDE_LAST); 455 void Claim(nsDependentCSubstring& aResult, 456 ClaimInclusion aInclude = EXCLUDE_LAST); 457 458 /** 459 * If aToken is found, aResult is set to the substring between the current 460 * position and the position of aToken, potentially including aToken depending 461 * on aInclude. 462 * If aToken isn't found aResult is set to the substring between the current 463 * position and the end of the string. 464 * If aToken is found, the method returns true. Otherwise it returns false. 465 * 466 * Calling Rollback() after ReadUntil() will return the read cursor to the 467 * position it had before ReadUntil was called. 468 */ 469 MOZ_MUST_USE bool ReadUntil(Token const& aToken, 470 nsDependentCSubstring& aResult, 471 ClaimInclusion aInclude = EXCLUDE_LAST); 472 MOZ_MUST_USE bool ReadUntil(Token const& aToken, nsACString& aResult, 473 ClaimInclusion aInclude = EXCLUDE_LAST); 474 475 protected: 476 // All these point to the original buffer passed to the Tokenizer's 477 // constructor 478 nsACString::const_char_iterator 479 mRecord; // Position where the recorded sub-string for Claim() is 480 nsACString::const_char_iterator 481 mRollback; // Position of the previous token start 482 483 private: 484 Tokenizer() = delete; 485 Tokenizer(const Tokenizer&) = delete; 486 Tokenizer(Tokenizer&&) = delete; 487 Tokenizer(const Tokenizer&&) = delete; 488 Tokenizer& operator=(const Tokenizer&) = delete; 489 }; 490 491 } // namespace mozilla 492 493 #endif // Tokenizer_h__ 494