1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */ 3 /* This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 #ifndef Tokenizer_h__ 8 #define Tokenizer_h__ 9 10 #include <type_traits> 11 12 #include "nsString.h" 13 #include "mozilla/CheckedInt.h" 14 #include "mozilla/ScopeExit.h" 15 #include "mozilla/UniquePtr.h" 16 #include "nsTArray.h" 17 18 namespace mozilla { 19 20 template <typename TChar> 21 class TokenizerBase { 22 public: 23 typedef nsTSubstring<TChar> TAString; 24 typedef nsTString<TChar> TString; 25 typedef nsTDependentString<TChar> TDependentString; 26 typedef nsTDependentSubstring<TChar> TDependentSubstring; 27 28 static TChar const sWhitespaces[]; 29 30 /** 31 * The analyzer works with elements in the input cut to a sequence of token 32 * where each token has an elementary type 33 */ 34 enum TokenType : uint32_t { 35 TOKEN_UNKNOWN, 36 TOKEN_RAW, 37 TOKEN_ERROR, 38 TOKEN_INTEGER, 39 TOKEN_WORD, 40 TOKEN_CHAR, 41 TOKEN_WS, 42 TOKEN_EOL, 43 TOKEN_EOF, 44 TOKEN_CUSTOM0 = 1000 45 }; 46 47 enum ECaseSensitivity { CASE_SENSITIVE, CASE_INSENSITIVE }; 48 49 /** 50 * Class holding the type and the value of a token. It can be manually 51 * created to allow checks against it via methods of TTokenizer or are results 52 * of some of the TTokenizer's methods. 53 */ 54 class Token { 55 TokenType mType; 56 TDependentSubstring mWord; 57 TString mCustom; 58 TChar mChar; 59 uint64_t mInteger; 60 ECaseSensitivity mCustomCaseInsensitivity; 61 bool mCustomEnabled; 62 63 // If this token is a result of the parsing process, this member is 64 // referencing a sub-string in the input buffer. If this is externally 65 // created Token this member is left an empty string. 66 TDependentSubstring mFragment; 67 68 friend class TokenizerBase<TChar>; 69 void AssignFragment(typename TAString::const_char_iterator begin, 70 typename TAString::const_char_iterator end); 71 72 static Token Raw(); 73 74 public: 75 Token(); 76 Token(const Token& aOther); 77 Token& operator=(const Token& aOther); 78 79 // Static constructors of tokens by type and value 80 static Token Word(TAString const& aWord); 81 static Token Char(TChar const aChar); 82 static Token Number(uint64_t const aNumber); 83 static Token Whitespace(); 84 static Token NewLine(); 85 static Token EndOfFile(); 86 static Token Error(); 87 88 // Compares the two tokens, type must be identical and value 89 // of one of the tokens must be 'any' or equal. 90 bool Equals(const Token& aOther) const; 91 Type()92 TokenType Type() const { return mType; } 93 TChar AsChar() const; 94 TDependentSubstring AsString() const; 95 uint64_t AsInteger() const; 96 Fragment()97 TDependentSubstring Fragment() const { return mFragment; } 98 }; 99 100 /** 101 * Consumers may register a custom string that, when found in the input, is 102 * considered a token and returned by Next*() and accepted by Check*() 103 * methods. AddCustomToken() returns a reference to a token that can then be 104 * comapred using Token::Equals() againts the output from Next*() or be passed 105 * to Check*(). 106 */ 107 Token AddCustomToken(const TAString& aValue, 108 ECaseSensitivity aCaseInsensitivity, 109 bool aEnabled = true); 110 template <uint32_t N> 111 Token AddCustomToken(const TChar (&aValue)[N], 112 ECaseSensitivity aCaseInsensitivity, 113 bool aEnabled = true) { 114 return AddCustomToken(TDependentSubstring(aValue, N - 1), 115 aCaseInsensitivity, aEnabled); 116 } 117 void RemoveCustomToken(Token& aToken); 118 /** 119 * Only applies to a custom type of a Token (see AddCustomToken above.) 120 * This turns on and off token recognition. When a custom token is disabled, 121 * it's ignored as never added as a custom token. 122 */ 123 void EnableCustomToken(Token const& aToken, bool aEnable); 124 125 /** 126 * Mode of tokenization. 127 * FULL tokenization, the default, recognizes built-in tokens and any custom 128 * tokens, if added. CUSTOM_ONLY will only recognize custom tokens, the rest 129 * is seen as 'raw'. This mode can be understood as a 'binary' mode. 130 */ 131 enum class Mode { FULL, CUSTOM_ONLY }; 132 void SetTokenizingMode(Mode aMode); 133 134 /** 135 * Return false iff the last Check*() call has returned false or when we've 136 * read past the end of the input string. 137 */ 138 [[nodiscard]] bool HasFailed() const; 139 140 protected: 141 explicit TokenizerBase(const TChar* aWhitespaces = nullptr, 142 const TChar* aAdditionalWordChars = nullptr); 143 144 // false if we have already read the EOF token. 145 bool HasInput() const; 146 // Main parsing function, it doesn't shift the read cursor, just returns the 147 // next token position. 148 typename TAString::const_char_iterator Parse(Token& aToken) const; 149 // Is read cursor at the end? 150 bool IsEnd(const typename TAString::const_char_iterator& caret) const; 151 // True, when we are at the end of the input data, but it has not been marked 152 // as complete yet. In that case we cannot proceed with providing a 153 // multi-TChar token. 154 bool IsPending(const typename TAString::const_char_iterator& caret) const; 155 // Is read cursor on a character that is a word start? 156 bool IsWordFirst(const TChar aInput) const; 157 // Is read cursor on a character that is an in-word letter? 158 bool IsWord(const TChar aInput) const; 159 // Is read cursor on a character that is a valid number? 160 // TODO - support multiple radix 161 bool IsNumber(const TChar aInput) const; 162 // Is equal to the given custom token? 163 bool IsCustom(const typename TAString::const_char_iterator& caret, 164 const Token& aCustomToken, uint32_t* aLongest = nullptr) const; 165 166 // Friendly helper to assign a fragment on a Token 167 static void AssignFragment(Token& aToken, 168 typename TAString::const_char_iterator begin, 169 typename TAString::const_char_iterator end); 170 171 #ifdef DEBUG 172 // This is called from inside Tokenizer methods to make sure the token is 173 // valid. 174 void Validate(Token const& aToken); 175 #endif 176 177 // true iff we have already read the EOF token 178 bool mPastEof; 179 // true iff the last Check*() call has returned false, reverts to true on 180 // Rollback() call 181 bool mHasFailed; 182 // true if the input string is final (finished), false when we expect more 183 // data yet to be fed to the tokenizer (see IncrementalTokenizer derived 184 // class). 185 bool mInputFinished; 186 // custom only vs full tokenizing mode, see the Parse() method 187 Mode mMode; 188 // minimal raw data chunked delivery during incremental feed 189 uint32_t mMinRawDelivery; 190 191 // Customizable list of whitespaces 192 const TChar* mWhitespaces; 193 // Additinal custom word characters 194 const TChar* mAdditionalWordChars; 195 196 // All these point to the original buffer passed to the constructor or to the 197 // incremental buffer after FeedInput. 198 typename TAString::const_char_iterator 199 mCursor; // Position of the current (actually next to read) token start 200 typename TAString::const_char_iterator mEnd; // End of the input position 201 202 // This is the list of tokens user has registered with AddCustomToken() 203 nsTArray<UniquePtr<Token>> mCustomTokens; 204 uint32_t mNextCustomTokenID; 205 206 private: 207 TokenizerBase() = delete; 208 TokenizerBase(const TokenizerBase&) = delete; 209 TokenizerBase(TokenizerBase&&) = delete; 210 TokenizerBase(const TokenizerBase&&) = delete; 211 TokenizerBase& operator=(const TokenizerBase&) = delete; 212 }; 213 214 /** 215 * This is a simple implementation of a lexical analyzer or maybe better 216 * called a tokenizer. 217 * 218 * Please use Tokenizer or Tokenizer16 classes, that are specializations 219 * of this template class. Tokenizer is for ASCII input, Tokenizer16 may 220 * handle char16_t input, but doesn't recognize whitespaces or numbers 221 * other than standard `char` specialized Tokenizer class. 222 */ 223 template <typename TChar> 224 class TTokenizer : public TokenizerBase<TChar> { 225 public: 226 typedef TokenizerBase<TChar> base; 227 228 /** 229 * @param aSource 230 * The string to parse. 231 * IMPORTANT NOTE: TTokenizer doesn't ensure the input string buffer 232 * lifetime. It's up to the consumer to make sure the string's buffer outlives 233 * the TTokenizer! 234 * @param aWhitespaces 235 * If non-null TTokenizer will use this custom set of whitespaces for 236 * CheckWhite() and SkipWhites() calls. By default the list consists of space 237 * and tab. 238 * @param aAdditionalWordChars 239 * If non-null it will be added to the list of characters that consist a 240 * word. This is useful when you want to accept e.g. '-' in HTTP headers. By 241 * default a word character is consider any character for which upper case 242 * is different from lower case. 243 * 244 * If there is an overlap between aWhitespaces and aAdditionalWordChars, the 245 * check for word characters is made first. 246 */ 247 explicit TTokenizer(const typename base::TAString& aSource, 248 const TChar* aWhitespaces = nullptr, 249 const TChar* aAdditionalWordChars = nullptr); 250 explicit TTokenizer(const TChar* aSource, const TChar* aWhitespaces = nullptr, 251 const TChar* aAdditionalWordChars = nullptr); 252 253 /** 254 * When there is still anything to read from the input, tokenize it, store the 255 * token type and value to aToken result and shift the cursor past this just 256 * parsed token. Each call to Next() reads another token from the input and 257 * shifts the cursor. Returns false if we have passed the end of the input. 258 */ 259 [[nodiscard]] bool Next(typename base::Token& aToken); 260 261 /** 262 * Parse the token on the input read cursor position, check its type is equal 263 * to aTokenType and if so, put it into aResult, shift the cursor and return 264 * true. Otherwise, leave the input read cursor position intact and return 265 * false. 266 */ 267 [[nodiscard]] bool Check(const typename base::TokenType aTokenType, 268 typename base::Token& aResult); 269 /** 270 * Same as above method, just compares both token type and token value passed 271 * in aToken. When both the type and the value equals, shift the cursor and 272 * return true. Otherwise return false. 273 */ 274 [[nodiscard]] bool Check(const typename base::Token& aToken); 275 276 /** 277 * SkipWhites method (below) may also skip new line characters automatically. 278 */ 279 enum WhiteSkipping { 280 /** 281 * SkipWhites will only skip what is defined as a white space (default). 282 */ 283 DONT_INCLUDE_NEW_LINE = 0, 284 /** 285 * SkipWhites will skip definited white spaces as well as new lines 286 * automatically. 287 */ 288 INCLUDE_NEW_LINE = 1 289 }; 290 291 /** 292 * Skips any occurence of whitespaces specified in mWhitespaces member, 293 * optionally skip also new lines. 294 */ 295 void SkipWhites(WhiteSkipping aIncludeNewLines = DONT_INCLUDE_NEW_LINE); 296 297 /** 298 * Skips all tokens until the given one is found or EOF is hit. The token 299 * or EOF are next to read. 300 */ 301 void SkipUntil(typename base::Token const& aToken); 302 303 // These are mostly shortcuts for the Check() methods above. 304 305 /** 306 * Check whitespace character is present. 307 */ CheckWhite()308 [[nodiscard]] bool CheckWhite() { return Check(base::Token::Whitespace()); } 309 /** 310 * Check there is a single character on the read cursor position. If so, 311 * shift the read cursor position and return true. Otherwise false. 312 */ CheckChar(const TChar aChar)313 [[nodiscard]] bool CheckChar(const TChar aChar) { 314 return Check(base::Token::Char(aChar)); 315 } 316 /** 317 * This is a customizable version of CheckChar. aClassifier is a function 318 * called with value of the character on the current input read position. If 319 * this user function returns true, read cursor is shifted and true returned. 320 * Otherwise false. The user classifiction function is not called when we are 321 * at or past the end and false is immediately returned. 322 */ 323 [[nodiscard]] bool CheckChar(bool (*aClassifier)(const TChar aChar)); 324 /** 325 * Check for a whole expected word. 326 */ CheckWord(const typename base::TAString & aWord)327 [[nodiscard]] bool CheckWord(const typename base::TAString& aWord) { 328 return Check(base::Token::Word(aWord)); 329 } 330 /** 331 * Shortcut for literal const word check with compile time length calculation. 332 */ 333 template <uint32_t N> CheckWord(const TChar (& aWord)[N])334 [[nodiscard]] bool CheckWord(const TChar (&aWord)[N]) { 335 return Check( 336 base::Token::Word(typename base::TDependentString(aWord, N - 1))); 337 } 338 /** 339 * Helper to check for a string compound of multiple tokens like "foo bar". 340 * The match is binary-exact, a white space or a delimiter character in the 341 * phrase must match exactly the characters in the input. 342 */ 343 [[nodiscard]] bool CheckPhrase(const typename base::TAString& aPhrase); 344 template <uint32_t N> CheckPhrase(const TChar (& aPhrase)[N])345 [[nodiscard]] bool CheckPhrase(const TChar (&aPhrase)[N]) { 346 return CheckPhrase(typename base::TDependentString(aPhrase, N - 1)); 347 } 348 /** 349 * Checks \r, \n or \r\n. 350 */ CheckEOL()351 [[nodiscard]] bool CheckEOL() { return Check(base::Token::NewLine()); } 352 /** 353 * Checks we are at the end of the input string reading. If so, shift past 354 * the end and returns true. Otherwise does nothing and returns false. 355 */ CheckEOF()356 [[nodiscard]] bool CheckEOF() { return Check(base::Token::EndOfFile()); } 357 358 /** 359 * These are shortcuts to obtain the value immediately when the token type 360 * matches. 361 */ 362 [[nodiscard]] bool ReadChar(TChar* aValue); 363 [[nodiscard]] bool ReadChar(bool (*aClassifier)(const TChar aChar), 364 TChar* aValue); 365 [[nodiscard]] bool ReadWord(typename base::TAString& aValue); 366 [[nodiscard]] bool ReadWord(typename base::TDependentSubstring& aValue); 367 368 /** 369 * This is an integer read helper. It returns false and doesn't move the read 370 * cursor when any of the following happens: 371 * - the token at the read cursor is not an integer 372 * - the final number doesn't fit the T type 373 * Otherwise true is returned, aValue is filled with the integral number 374 * and the cursor is moved forward. 375 */ 376 template <typename T> ReadInteger(T * aValue)377 [[nodiscard]] bool ReadInteger(T* aValue) { 378 MOZ_RELEASE_ASSERT(aValue); 379 380 typename base::TAString::const_char_iterator rollback = mRollback; 381 typename base::TAString::const_char_iterator cursor = base::mCursor; 382 typename base::Token t; 383 if (!Check(base::TOKEN_INTEGER, t)) { 384 return false; 385 } 386 387 mozilla::CheckedInt<T> checked(t.AsInteger()); 388 if (!checked.isValid()) { 389 // Move to a state as if Check() call has failed 390 mRollback = rollback; 391 base::mCursor = cursor; 392 base::mHasFailed = true; 393 return false; 394 } 395 396 *aValue = checked.value(); 397 return true; 398 } 399 400 /** 401 * Same as above, but accepts an integer with an optional minus sign. 402 */ 403 template <typename T, typename V = std::enable_if_t< 404 std::is_signed_v<std::remove_pointer_t<T>>, 405 std::remove_pointer_t<T>>> ReadSignedInteger(T * aValue)406 [[nodiscard]] bool ReadSignedInteger(T* aValue) { 407 MOZ_RELEASE_ASSERT(aValue); 408 409 typename base::TAString::const_char_iterator rollback = mRollback; 410 typename base::TAString::const_char_iterator cursor = base::mCursor; 411 auto revert = MakeScopeExit([&] { 412 // Move to a state as if Check() call has failed 413 mRollback = rollback; 414 base::mCursor = cursor; 415 base::mHasFailed = true; 416 }); 417 418 // Using functional raw access because '-' could be part of the word set 419 // making CheckChar('-') not work. 420 bool minus = CheckChar([](const TChar aChar) { return aChar == '-'; }); 421 422 typename base::Token t; 423 if (!Check(base::TOKEN_INTEGER, t)) { 424 return false; 425 } 426 427 mozilla::CheckedInt<T> checked(t.AsInteger()); 428 if (minus) { 429 checked *= -1; 430 } 431 432 if (!checked.isValid()) { 433 return false; 434 } 435 436 *aValue = checked.value(); 437 revert.release(); 438 return true; 439 } 440 441 /** 442 * Returns the read cursor position back as it was before the last call of any 443 * parsing method of TTokenizer (Next, Check*, Skip*, Read*) so that the last 444 * operation can be repeated. Rollback cannot be used multiple times, it only 445 * reverts the last successfull parse operation. It also cannot be used 446 * before any parsing operation has been called on the TTokenizer. 447 */ 448 void Rollback(); 449 450 /** 451 * Record() and Claim() are collecting the input as it is being parsed to 452 * obtain a substring between particular syntax bounderies defined by any 453 * recursive descent parser or simple parser the TTokenizer is used to read 454 * the input for. Inlucsion of a token that has just been parsed can be 455 * controlled using an arguemnt. 456 */ 457 enum ClaimInclusion { 458 /** 459 * Include resulting (or passed) token of the last lexical analyzer 460 * operation in the result. 461 */ 462 INCLUDE_LAST, 463 /** 464 * Do not include it. 465 */ 466 EXCLUDE_LAST 467 }; 468 469 /** 470 * Start the process of recording. Based on aInclude value the begining of 471 * the recorded sub-string is at the current position (EXCLUDE_LAST) or at the 472 * position before the last parsed token (INCLUDE_LAST). 473 */ 474 void Record(ClaimInclusion aInclude = EXCLUDE_LAST); 475 /** 476 * Claim result of the record started with Record() call before. Depending on 477 * aInclude the ending of the sub-string result includes or excludes the last 478 * parsed or checked token. 479 */ 480 void Claim(typename base::TAString& aResult, 481 ClaimInclusion aInclude = EXCLUDE_LAST); 482 void Claim(typename base::TDependentSubstring& aResult, 483 ClaimInclusion aInclude = EXCLUDE_LAST); 484 485 /** 486 * If aToken is found, aResult is set to the substring between the current 487 * position and the position of aToken, potentially including aToken depending 488 * on aInclude. 489 * If aToken isn't found aResult is set to the substring between the current 490 * position and the end of the string. 491 * If aToken is found, the method returns true. Otherwise it returns false. 492 * 493 * Calling Rollback() after ReadUntil() will return the read cursor to the 494 * position it had before ReadUntil was called. 495 */ 496 [[nodiscard]] bool ReadUntil(typename base::Token const& aToken, 497 typename base::TDependentSubstring& aResult, 498 ClaimInclusion aInclude = EXCLUDE_LAST); 499 [[nodiscard]] bool ReadUntil(typename base::Token const& aToken, 500 typename base::TAString& aResult, 501 ClaimInclusion aInclude = EXCLUDE_LAST); 502 503 protected: 504 // All these point to the original buffer passed to the TTokenizer's 505 // constructor 506 typename base::TAString::const_char_iterator 507 mRecord; // Position where the recorded sub-string for Claim() is 508 typename base::TAString::const_char_iterator 509 mRollback; // Position of the previous token start 510 511 private: 512 TTokenizer() = delete; 513 TTokenizer(const TTokenizer&) = delete; 514 TTokenizer(TTokenizer&&) = delete; 515 TTokenizer(const TTokenizer&&) = delete; 516 TTokenizer& operator=(const TTokenizer&) = delete; 517 }; 518 519 typedef TTokenizer<char> Tokenizer; 520 typedef TTokenizer<char16_t> Tokenizer16; 521 522 } // namespace mozilla 523 524 #endif // Tokenizer_h__ 525