1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */ 3 /* This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 #ifndef Tokenizer_h__ 8 #define Tokenizer_h__ 9 10 #include "nsString.h" 11 #include "mozilla/CheckedInt.h" 12 #include "mozilla/UniquePtr.h" 13 #include "nsTArray.h" 14 15 namespace mozilla { 16 17 class TokenizerBase 18 { 19 public: 20 /** 21 * The analyzer works with elements in the input cut to a sequence of token 22 * where each token has an elementary type 23 */ 24 enum TokenType : uint32_t 25 { 26 TOKEN_UNKNOWN, 27 TOKEN_RAW, 28 TOKEN_ERROR, 29 TOKEN_INTEGER, 30 TOKEN_WORD, 31 TOKEN_CHAR, 32 TOKEN_WS, 33 TOKEN_EOL, 34 TOKEN_EOF, 35 TOKEN_CUSTOM0 = 1000 36 }; 37 38 enum ECaseSensitivity 39 { 40 CASE_SENSITIVE, 41 CASE_INSENSITIVE 42 }; 43 44 /** 45 * Class holding the type and the value of a token. It can be manually created 46 * to allow checks against it via methods of Tokenizer or are results of some of 47 * the Tokenizer's methods. 48 */ 49 class Token 50 { 51 TokenType mType; 52 nsDependentCSubstring mWord; 53 nsCString mCustom; 54 char mChar; 55 uint64_t mInteger; 56 ECaseSensitivity mCustomCaseInsensitivity; 57 bool mCustomEnabled; 58 59 // If this token is a result of the parsing process, this member is referencing 60 // a sub-string in the input buffer. If this is externally created Token this 61 // member is left an empty string. 62 nsDependentCSubstring mFragment; 63 64 friend class TokenizerBase; 65 void AssignFragment(nsACString::const_char_iterator begin, 66 nsACString::const_char_iterator end); 67 68 static Token Raw(); 69 70 public: 71 Token(); 72 Token(const Token& aOther); 73 Token& operator=(const Token& aOther); 74 75 // Static constructors of tokens by type and value 76 static Token Word(const nsACString& aWord); 77 static Token Char(const char aChar); 78 static Token Number(const uint64_t aNumber); 79 static Token Whitespace(); 80 static Token NewLine(); 81 static Token EndOfFile(); 82 static Token Error(); 83 84 // Compares the two tokens, type must be identical and value 85 // of one of the tokens must be 'any' or equal. 86 bool Equals(const Token& aOther) const; 87 Type()88 TokenType Type() const { return mType; } 89 char AsChar() const; 90 nsDependentCSubstring AsString() const; 91 uint64_t AsInteger() const; 92 Fragment()93 nsDependentCSubstring Fragment() const { return mFragment; } 94 }; 95 96 /** 97 * Consumers may register a custom string that, when found in the input, is considered 98 * a token and returned by Next*() and accepted by Check*() methods. 99 * AddCustomToken() returns a reference to a token that can then be comapred using 100 * Token::Equals() againts the output from Next*() or be passed to Check*(). 101 */ 102 Token AddCustomToken(const nsACString& aValue, ECaseSensitivity aCaseInsensitivity, bool aEnabled = true); 103 template <uint32_t N> 104 Token AddCustomToken(const char(&aValue)[N], ECaseSensitivity aCaseInsensitivity, bool aEnabled = true) 105 { 106 return AddCustomToken(nsDependentCSubstring(aValue, N - 1), aCaseInsensitivity, aEnabled); 107 } 108 void RemoveCustomToken(Token& aToken); 109 /** 110 * Only applies to a custom type of a Token (see AddCustomToken above.) 111 * This turns on and off token recognition. When a custom token is disabled, 112 * it's ignored as never added as a custom token. 113 */ 114 void EnableCustomToken(Token const& aToken, bool aEnable); 115 116 /** 117 * Mode of tokenization. 118 * FULL tokenization, the default, recognizes built-in tokens and any custom tokens, 119 * if added. 120 * CUSTOM_ONLY will only recognize custom tokens, the rest is seen as 'raw'. 121 * This mode can be understood as a 'binary' mode. 122 */ 123 enum class Mode 124 { 125 FULL, 126 CUSTOM_ONLY 127 }; 128 void SetTokenizingMode(Mode aMode); 129 130 /** 131 * Return false iff the last Check*() call has returned false or when we've read past 132 * the end of the input string. 133 */ 134 MOZ_MUST_USE bool HasFailed() const; 135 136 protected: 137 explicit TokenizerBase(const char* aWhitespaces = nullptr, 138 const char* aAdditionalWordChars = nullptr); 139 140 // false if we have already read the EOF token. 141 bool HasInput() const; 142 // Main parsing function, it doesn't shift the read cursor, just returns the next 143 // token position. 144 nsACString::const_char_iterator Parse(Token& aToken) const; 145 // Is read cursor at the end? 146 bool IsEnd(const nsACString::const_char_iterator& caret) const; 147 // True, when we are at the end of the input data, but it has not been marked 148 // as complete yet. In that case we cannot proceed with providing a multi-char token. 149 bool IsPending(const nsACString::const_char_iterator & caret) const; 150 // Is read cursor on a character that is a word start? 151 bool IsWordFirst(const char aInput) const; 152 // Is read cursor on a character that is an in-word letter? 153 bool IsWord(const char aInput) const; 154 // Is read cursor on a character that is a valid number? 155 // TODO - support multiple radix 156 bool IsNumber(const char aInput) const; 157 // Is equal to the given custom token? 158 bool IsCustom(const nsACString::const_char_iterator& caret, 159 const Token& aCustomToken, uint32_t* aLongest = nullptr) const; 160 161 // Friendly helper to assign a fragment on a Token 162 static void AssignFragment(Token& aToken, 163 nsACString::const_char_iterator begin, 164 nsACString::const_char_iterator end); 165 166 // true iff we have already read the EOF token 167 bool mPastEof; 168 // true iff the last Check*() call has returned false, reverts to true on Rollback() call 169 bool mHasFailed; 170 // true if the input string is final (finished), false when we expect more data 171 // yet to be fed to the tokenizer (see IncrementalTokenizer derived class). 172 bool mInputFinished; 173 // custom only vs full tokenizing mode, see the Parse() method 174 Mode mMode; 175 // minimal raw data chunked delivery during incremental feed 176 uint32_t mMinRawDelivery; 177 178 // Customizable list of whitespaces 179 const char* mWhitespaces; 180 // Additinal custom word characters 181 const char* mAdditionalWordChars; 182 183 // All these point to the original buffer passed to the constructor or to the incremental 184 // buffer after FeedInput. 185 nsACString::const_char_iterator mCursor; // Position of the current (actually next to read) token start 186 nsACString::const_char_iterator mEnd; // End of the input position 187 188 // This is the list of tokens user has registered with AddCustomToken() 189 nsTArray<UniquePtr<Token>> mCustomTokens; 190 uint32_t mNextCustomTokenID; 191 192 private: 193 TokenizerBase() = delete; 194 TokenizerBase(const TokenizerBase&) = delete; 195 TokenizerBase(TokenizerBase&&) = delete; 196 TokenizerBase(const TokenizerBase&&) = delete; 197 TokenizerBase &operator=(const TokenizerBase&) = delete; 198 }; 199 200 /** 201 * This is a simple implementation of a lexical analyzer or maybe better 202 * called a tokenizer. It doesn't allow any user dictionaries or 203 * user define token types. 204 * 205 * It is limited only to ASCII input for now. UTF-8 or any other input 206 * encoding must yet be implemented. 207 */ 208 class Tokenizer : public TokenizerBase 209 { 210 public: 211 /** 212 * @param aSource 213 * The string to parse. 214 * IMPORTANT NOTE: Tokenizer doesn't ensure the input string buffer lifetime. 215 * It's up to the consumer to make sure the string's buffer outlives the Tokenizer! 216 * @param aWhitespaces 217 * If non-null Tokenizer will use this custom set of whitespaces for CheckWhite() 218 * and SkipWhites() calls. 219 * By default the list consists of space and tab. 220 * @param aAdditionalWordChars 221 * If non-null it will be added to the list of characters that consist a word. 222 * This is useful when you want to accept e.g. '-' in HTTP headers. 223 * By default a word character is consider any character for which upper case 224 * is different from lower case. 225 * 226 * If there is an overlap between aWhitespaces and aAdditionalWordChars, the check for 227 * word characters is made first. 228 */ 229 explicit Tokenizer(const nsACString& aSource, 230 const char* aWhitespaces = nullptr, 231 const char* aAdditionalWordChars = nullptr); 232 explicit Tokenizer(const char* aSource, 233 const char* aWhitespaces = nullptr, 234 const char* aAdditionalWordChars = nullptr); 235 236 /** 237 * When there is still anything to read from the input, tokenize it, store the token type 238 * and value to aToken result and shift the cursor past this just parsed token. Each call 239 * to Next() reads another token from the input and shifts the cursor. 240 * Returns false if we have passed the end of the input. 241 */ 242 MOZ_MUST_USE 243 bool Next(Token& aToken); 244 245 /** 246 * Parse the token on the input read cursor position, check its type is equal to aTokenType 247 * and if so, put it into aResult, shift the cursor and return true. Otherwise, leave 248 * the input read cursor position intact and return false. 249 */ 250 MOZ_MUST_USE 251 bool Check(const TokenType aTokenType, Token& aResult); 252 /** 253 * Same as above method, just compares both token type and token value passed in aToken. 254 * When both the type and the value equals, shift the cursor and return true. Otherwise 255 * return false. 256 */ 257 MOZ_MUST_USE 258 bool Check(const Token& aToken); 259 260 /** 261 * SkipWhites method (below) may also skip new line characters automatically. 262 */ 263 enum WhiteSkipping { 264 /** 265 * SkipWhites will only skip what is defined as a white space (default). 266 */ 267 DONT_INCLUDE_NEW_LINE = 0, 268 /** 269 * SkipWhites will skip definited white spaces as well as new lines 270 * automatically. 271 */ 272 INCLUDE_NEW_LINE = 1 273 }; 274 275 /** 276 * Skips any occurence of whitespaces specified in mWhitespaces member, 277 * optionally skip also new lines. 278 */ 279 void SkipWhites(WhiteSkipping aIncludeNewLines = DONT_INCLUDE_NEW_LINE); 280 281 /** 282 * Skips all tokens until the given one is found or EOF is hit. The token 283 * or EOF are next to read. 284 */ 285 void SkipUntil(Token const& aToken); 286 287 // These are mostly shortcuts for the Check() methods above. 288 289 /** 290 * Check whitespace character is present. 291 */ 292 MOZ_MUST_USE CheckWhite()293 bool CheckWhite() { return Check(Token::Whitespace()); } 294 /** 295 * Check there is a single character on the read cursor position. If so, shift the read 296 * cursor position and return true. Otherwise false. 297 */ 298 MOZ_MUST_USE CheckChar(const char aChar)299 bool CheckChar(const char aChar) { return Check(Token::Char(aChar)); } 300 /** 301 * This is a customizable version of CheckChar. aClassifier is a function called with 302 * value of the character on the current input read position. If this user function 303 * returns true, read cursor is shifted and true returned. Otherwise false. 304 * The user classifiction function is not called when we are at or past the end and 305 * false is immediately returned. 306 */ 307 MOZ_MUST_USE 308 bool CheckChar(bool (*aClassifier)(const char aChar)); 309 /** 310 * Check for a whole expected word. 311 */ 312 MOZ_MUST_USE CheckWord(const nsACString & aWord)313 bool CheckWord(const nsACString& aWord) { return Check(Token::Word(aWord)); } 314 /** 315 * Shortcut for literal const word check with compile time length calculation. 316 */ 317 template <uint32_t N> 318 MOZ_MUST_USE CheckWord(const char (& aWord)[N])319 bool CheckWord(const char (&aWord)[N]) { return Check(Token::Word(nsDependentCString(aWord, N - 1))); } 320 /** 321 * Checks \r, \n or \r\n. 322 */ 323 MOZ_MUST_USE CheckEOL()324 bool CheckEOL() { return Check(Token::NewLine()); } 325 /** 326 * Checks we are at the end of the input string reading. If so, shift past the end 327 * and returns true. Otherwise does nothing and returns false. 328 */ 329 MOZ_MUST_USE CheckEOF()330 bool CheckEOF() { return Check(Token::EndOfFile()); } 331 332 /** 333 * These are shortcuts to obtain the value immediately when the token type matches. 334 */ 335 MOZ_MUST_USE bool ReadChar(char* aValue); 336 MOZ_MUST_USE bool ReadChar(bool (*aClassifier)(const char aChar), 337 char* aValue); 338 MOZ_MUST_USE bool ReadWord(nsACString& aValue); 339 MOZ_MUST_USE bool ReadWord(nsDependentCSubstring& aValue); 340 341 /** 342 * This is an integer read helper. It returns false and doesn't move the read 343 * cursor when any of the following happens: 344 * - the token at the read cursor is not an integer 345 * - the final number doesn't fit the T type 346 * Otherwise true is returned, aValue is filled with the integral number 347 * and the cursor is moved forward. 348 */ 349 template <typename T> ReadInteger(T * aValue)350 MOZ_MUST_USE bool ReadInteger(T* aValue) 351 { 352 MOZ_RELEASE_ASSERT(aValue); 353 354 nsACString::const_char_iterator rollback = mRollback; 355 nsACString::const_char_iterator cursor = mCursor; 356 Token t; 357 if (!Check(TOKEN_INTEGER, t)) { 358 return false; 359 } 360 361 mozilla::CheckedInt<T> checked(t.AsInteger()); 362 if (!checked.isValid()) { 363 // Move to a state as if Check() call has failed 364 mRollback = rollback; 365 mCursor = cursor; 366 mHasFailed = true; 367 return false; 368 } 369 370 *aValue = checked.value(); 371 return true; 372 } 373 374 /** 375 * Returns the read cursor position back as it was before the last call of any parsing 376 * method of Tokenizer (Next, Check*, Skip*, Read*) so that the last operation 377 * can be repeated. 378 * Rollback cannot be used multiple times, it only reverts the last successfull parse 379 * operation. It also cannot be used before any parsing operation has been called 380 * on the Tokenizer. 381 */ 382 void Rollback(); 383 384 /** 385 * Record() and Claim() are collecting the input as it is being parsed to obtain 386 * a substring between particular syntax bounderies defined by any recursive 387 * descent parser or simple parser the Tokenizer is used to read the input for. 388 * Inlucsion of a token that has just been parsed can be controlled using an arguemnt. 389 */ 390 enum ClaimInclusion { 391 /** 392 * Include resulting (or passed) token of the last lexical analyzer operation in the result. 393 */ 394 INCLUDE_LAST, 395 /** 396 * Do not include it. 397 */ 398 EXCLUDE_LAST 399 }; 400 401 /** 402 * Start the process of recording. Based on aInclude value the begining of the recorded 403 * sub-string is at the current position (EXCLUDE_LAST) or at the position before the last 404 * parsed token (INCLUDE_LAST). 405 */ 406 void Record(ClaimInclusion aInclude = EXCLUDE_LAST); 407 /** 408 * Claim result of the record started with Record() call before. Depending on aInclude 409 * the ending of the sub-string result includes or excludes the last parsed or checked 410 * token. 411 */ 412 void Claim(nsACString& aResult, ClaimInclusion aInclude = EXCLUDE_LAST); 413 void Claim(nsDependentCSubstring& aResult, ClaimInclusion aInclude = EXCLUDE_LAST); 414 415 /** 416 * If aToken is found, aResult is set to the substring between the current 417 * position and the position of aToken, potentially including aToken depending 418 * on aInclude. 419 * If aToken isn't found aResult is set to the substring between the current 420 * position and the end of the string. 421 * If aToken is found, the method returns true. Otherwise it returns false. 422 * 423 * Calling Rollback() after ReadUntil() will return the read cursor to the 424 * position it had before ReadUntil was called. 425 */ 426 MOZ_MUST_USE bool ReadUntil(Token const& aToken, nsDependentCSubstring& aResult, 427 ClaimInclusion aInclude = EXCLUDE_LAST); 428 MOZ_MUST_USE bool ReadUntil(Token const& aToken, nsACString& aResult, 429 ClaimInclusion aInclude = EXCLUDE_LAST); 430 431 protected: 432 // All these point to the original buffer passed to the Tokenizer's constructor 433 nsACString::const_char_iterator mRecord; // Position where the recorded sub-string for Claim() is 434 nsACString::const_char_iterator mRollback; // Position of the previous token start 435 436 private: 437 Tokenizer() = delete; 438 Tokenizer(const Tokenizer&) = delete; 439 Tokenizer(Tokenizer&&) = delete; 440 Tokenizer(const Tokenizer&&) = delete; 441 Tokenizer &operator=(const Tokenizer&) = delete; 442 }; 443 444 } // mozilla 445 446 #endif // Tokenizer_h__ 447