1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 #ifndef Tokenizer_h__
8 #define Tokenizer_h__
9 
10 #include <type_traits>
11 
12 #include "nsString.h"
13 #include "mozilla/CheckedInt.h"
14 #include "mozilla/ScopeExit.h"
15 #include "mozilla/UniquePtr.h"
16 #include "nsTArray.h"
17 
18 namespace mozilla {
19 
20 template <typename TChar>
21 class TokenizerBase {
22  public:
23   typedef nsTSubstring<TChar> TAString;
24   typedef nsTString<TChar> TString;
25   typedef nsTDependentString<TChar> TDependentString;
26   typedef nsTDependentSubstring<TChar> TDependentSubstring;
27 
28   static TChar const sWhitespaces[];
29 
30   /**
31    * The analyzer works with elements in the input cut to a sequence of tokens
32    * where each token has an elementary type.
33    */
34   enum TokenType : uint32_t {
35     TOKEN_UNKNOWN,
36     TOKEN_RAW,
37     TOKEN_ERROR,
38     TOKEN_INTEGER,
39     TOKEN_WORD,
40     TOKEN_CHAR,
41     TOKEN_WS,
42     TOKEN_EOL,
43     TOKEN_EOF,
44     TOKEN_CUSTOM0 = 1000
45   };
46 
47   enum ECaseSensitivity { CASE_SENSITIVE, CASE_INSENSITIVE };
48 
49   /**
50    * Class holding the type and the value of a token.  It can be manually
51    * created to allow checks against it via methods of TTokenizer or are results
52    * of some of the TTokenizer's methods.
53    */
54   class Token {
55     TokenType mType;
56     TDependentSubstring mWord;
57     TString mCustom;
58     TChar mChar;
59     uint64_t mInteger;
60     ECaseSensitivity mCustomCaseInsensitivity;
61     bool mCustomEnabled;
62 
63     // If this token is a result of the parsing process, this member is
64     // referencing a sub-string in the input buffer.  If this is externally
65     // created Token this member is left an empty string.
66     TDependentSubstring mFragment;
67 
68     friend class TokenizerBase<TChar>;
69     void AssignFragment(typename TAString::const_char_iterator begin,
70                         typename TAString::const_char_iterator end);
71 
72     static Token Raw();
73 
74    public:
75     Token();
76     Token(const Token& aOther);
77     Token& operator=(const Token& aOther);
78 
79     // Static constructors of tokens by type and value
80     static Token Word(TAString const& aWord);
81     static Token Char(TChar const aChar);
82     static Token Number(uint64_t const aNumber);
83     static Token Whitespace();
84     static Token NewLine();
85     static Token EndOfFile();
86     static Token Error();
87 
88     // Compares the two tokens, type must be identical and value
89     // of one of the tokens must be 'any' or equal.
90     bool Equals(const Token& aOther) const;
91 
Type()92     TokenType Type() const { return mType; }
93     TChar AsChar() const;
94     TDependentSubstring AsString() const;
95     uint64_t AsInteger() const;
96 
Fragment()97     TDependentSubstring Fragment() const { return mFragment; }
98   };
99 
100   /**
101    * Consumers may register a custom string that, when found in the input, is
102    * considered a token and returned by Next*() and accepted by Check*()
103    * methods. AddCustomToken() returns a reference to a token that can then be
104    * compared using Token::Equals() against the output from Next*() or be passed
105    * to Check*().
106    */
107   Token AddCustomToken(const TAString& aValue,
108                        ECaseSensitivity aCaseInsensitivity,
109                        bool aEnabled = true);
110   template <uint32_t N>
111   Token AddCustomToken(const TChar (&aValue)[N],
112                        ECaseSensitivity aCaseInsensitivity,
113                        bool aEnabled = true) {
114     return AddCustomToken(TDependentSubstring(aValue, N - 1),
115                           aCaseInsensitivity, aEnabled);
116   }
117   void RemoveCustomToken(Token& aToken);
118   /**
119    * Only applies to a custom type of a Token (see AddCustomToken above.)
120    * This turns on and off token recognition.  When a custom token is disabled,
121    * it's ignored as if it had never been added as a custom token.
122    */
123   void EnableCustomToken(Token const& aToken, bool aEnable);
124 
125   /**
126    * Mode of tokenization.
127    * FULL tokenization, the default, recognizes built-in tokens and any custom
128    * tokens, if added. CUSTOM_ONLY will only recognize custom tokens, the rest
129    * is seen as 'raw'. This mode can be understood as a 'binary' mode.
130    */
131   enum class Mode { FULL, CUSTOM_ONLY };
132   void SetTokenizingMode(Mode aMode);
133 
134   /**
135    * Return true iff the last Check*() call has returned false or when we've
136    * read past the end of the input string.
137    */
138   [[nodiscard]] bool HasFailed() const;
139 
140  protected:
141   explicit TokenizerBase(const TChar* aWhitespaces = nullptr,
142                          const TChar* aAdditionalWordChars = nullptr);
143 
144   // false if we have already read the EOF token.
145   bool HasInput() const;
146   // Main parsing function, it doesn't shift the read cursor, just returns the
147   // next token position.
148   typename TAString::const_char_iterator Parse(Token& aToken) const;
149   // Is read cursor at the end?
150   bool IsEnd(const typename TAString::const_char_iterator& caret) const;
151   // True, when we are at the end of the input data, but it has not been marked
152   // as complete yet.  In that case we cannot proceed with providing a
153   // multi-TChar token.
154   bool IsPending(const typename TAString::const_char_iterator& caret) const;
155   // Is read cursor on a character that is a word start?
156   bool IsWordFirst(const TChar aInput) const;
157   // Is read cursor on a character that is an in-word letter?
158   bool IsWord(const TChar aInput) const;
159   // Is read cursor on a character that is a valid number?
160   // TODO - support multiple radix
161   bool IsNumber(const TChar aInput) const;
162   // Is equal to the given custom token?
163   bool IsCustom(const typename TAString::const_char_iterator& caret,
164                 const Token& aCustomToken, uint32_t* aLongest = nullptr) const;
165 
166   // Friendly helper to assign a fragment on a Token
167   static void AssignFragment(Token& aToken,
168                              typename TAString::const_char_iterator begin,
169                              typename TAString::const_char_iterator end);
170 
171 #ifdef DEBUG
172   // This is called from inside Tokenizer methods to make sure the token is
173   // valid.
174   void Validate(Token const& aToken);
175 #endif
176 
177   // true iff we have already read the EOF token
178   bool mPastEof;
179   // true iff the last Check*() call has returned false, reverts to false on
180   // Rollback() call
181   bool mHasFailed;
182   // true if the input string is final (finished), false when we expect more
183   // data yet to be fed to the tokenizer (see IncrementalTokenizer derived
184   // class).
185   bool mInputFinished;
186   // custom only vs full tokenizing mode, see the Parse() method
187   Mode mMode;
188   // minimal raw data chunked delivery during incremental feed
189   uint32_t mMinRawDelivery;
190 
191   // Customizable list of whitespaces
192   const TChar* mWhitespaces;
193   // Additional custom word characters
194   const TChar* mAdditionalWordChars;
195 
196   // All these point to the original buffer passed to the constructor or to the
197   // incremental buffer after FeedInput.
198   typename TAString::const_char_iterator
199       mCursor;  // Position of the current (actually next to read) token start
200   typename TAString::const_char_iterator mEnd;  // End of the input position
201 
202   // This is the list of tokens user has registered with AddCustomToken()
203   nsTArray<UniquePtr<Token>> mCustomTokens;
204   uint32_t mNextCustomTokenID;
205 
206  private:
207   TokenizerBase() = delete;
208   TokenizerBase(const TokenizerBase&) = delete;
209   TokenizerBase(TokenizerBase&&) = delete;
210   TokenizerBase(const TokenizerBase&&) = delete;
211   TokenizerBase& operator=(const TokenizerBase&) = delete;
212 };
213 
214 /**
215  * This is a simple implementation of a lexical analyzer or maybe better
216  * called a tokenizer.
217  *
218  * Please use Tokenizer or Tokenizer16 classes, that are specializations
219  * of this template class.  Tokenizer is for ASCII input, Tokenizer16 may
220  * handle char16_t input, but doesn't recognize whitespaces or numbers
221  * other than standard `char` specialized Tokenizer class.
222  */
223 template <typename TChar>
224 class TTokenizer : public TokenizerBase<TChar> {
225  public:
226   typedef TokenizerBase<TChar> base;
227 
228   /**
229    * @param aSource
230    *    The string to parse.
231    *    IMPORTANT NOTE: TTokenizer doesn't ensure the input string buffer
232    * lifetime. It's up to the consumer to make sure the string's buffer outlives
233    * the TTokenizer!
234    * @param aWhitespaces
235    *    If non-null TTokenizer will use this custom set of whitespaces for
236    * CheckWhite() and SkipWhites() calls. By default the list consists of space
237    * and tab.
238    * @param aAdditionalWordChars
239    *    If non-null it will be added to the list of characters that consist a
240    * word. This is useful when you want to accept e.g. '-' in HTTP headers. By
241    * default a word character is considered any character for which upper case
242    *    is different from lower case.
243    *
244    * If there is an overlap between aWhitespaces and aAdditionalWordChars, the
245    * check for word characters is made first.
246    */
247   explicit TTokenizer(const typename base::TAString& aSource,
248                       const TChar* aWhitespaces = nullptr,
249                       const TChar* aAdditionalWordChars = nullptr);
250   explicit TTokenizer(const TChar* aSource, const TChar* aWhitespaces = nullptr,
251                       const TChar* aAdditionalWordChars = nullptr);
252 
253   /**
254    * When there is still anything to read from the input, tokenize it, store the
255    * token type and value to aToken result and shift the cursor past this just
256    * parsed token.  Each call to Next() reads another token from the input and
257    * shifts the cursor. Returns false if we have passed the end of the input.
258    */
259   [[nodiscard]] bool Next(typename base::Token& aToken);
260 
261   /**
262    * Parse the token on the input read cursor position, check its type is equal
263    * to aTokenType and if so, put it into aResult, shift the cursor and return
264    * true.  Otherwise, leave the input read cursor position intact and return
265    * false.
266    */
267   [[nodiscard]] bool Check(const typename base::TokenType aTokenType,
268                            typename base::Token& aResult);
269   /**
270    * Same as above method, just compares both token type and token value passed
271    * in aToken. When both the type and the value equals, shift the cursor and
272    * return true.  Otherwise return false.
273    */
274   [[nodiscard]] bool Check(const typename base::Token& aToken);
275 
276   /**
277    * SkipWhites method (below) may also skip new line characters automatically.
278    */
279   enum WhiteSkipping {
280     /**
281      * SkipWhites will only skip what is defined as a white space (default).
282      */
283     DONT_INCLUDE_NEW_LINE = 0,
284     /**
285      * SkipWhites will skip defined white spaces as well as new lines
286      * automatically.
287      */
288     INCLUDE_NEW_LINE = 1
289   };
290 
291   /**
292    * Skips any occurrence of whitespaces specified in mWhitespaces member,
293    * optionally skip also new lines.
294    */
295   void SkipWhites(WhiteSkipping aIncludeNewLines = DONT_INCLUDE_NEW_LINE);
296 
297   /**
298    * Skips all tokens until the given one is found or EOF is hit.  The token
299    * or EOF are next to read.
300    */
301   void SkipUntil(typename base::Token const& aToken);
302 
303   // These are mostly shortcuts for the Check() methods above.
304 
305   /**
306    * Check whitespace character is present.
307    */
CheckWhite()308   [[nodiscard]] bool CheckWhite() { return Check(base::Token::Whitespace()); }
309   /**
310    * Check there is a single character on the read cursor position.  If so,
311    * shift the read cursor position and return true.  Otherwise false.
312    */
CheckChar(const TChar aChar)313   [[nodiscard]] bool CheckChar(const TChar aChar) {
314     return Check(base::Token::Char(aChar));
315   }
316   /**
317    * This is a customizable version of CheckChar.  aClassifier is a function
318    * called with value of the character on the current input read position.  If
319    * this user function returns true, read cursor is shifted and true returned.
320    * Otherwise false. The user classification function is not called when we are
321    * at or past the end and false is immediately returned.
322    */
323   [[nodiscard]] bool CheckChar(bool (*aClassifier)(const TChar aChar));
324   /**
325    * Check for a whole expected word.
326    */
CheckWord(const typename base::TAString & aWord)327   [[nodiscard]] bool CheckWord(const typename base::TAString& aWord) {
328     return Check(base::Token::Word(aWord));
329   }
330   /**
331    * Shortcut for literal const word check with compile time length calculation.
332    */
333   template <uint32_t N>
CheckWord(const TChar (& aWord)[N])334   [[nodiscard]] bool CheckWord(const TChar (&aWord)[N]) {
335     return Check(
336         base::Token::Word(typename base::TDependentString(aWord, N - 1)));
337   }
338   /**
339    * Helper to check for a string compound of multiple tokens like "foo bar".
340    * The match is binary-exact, a white space or a delimiter character in the
341    * phrase must match exactly the characters in the input.
342    */
343   [[nodiscard]] bool CheckPhrase(const typename base::TAString& aPhrase);
344   template <uint32_t N>
CheckPhrase(const TChar (& aPhrase)[N])345   [[nodiscard]] bool CheckPhrase(const TChar (&aPhrase)[N]) {
346     return CheckPhrase(typename base::TDependentString(aPhrase, N - 1));
347   }
348   /**
349    * Checks \r, \n or \r\n.
350    */
CheckEOL()351   [[nodiscard]] bool CheckEOL() { return Check(base::Token::NewLine()); }
352   /**
353    * Checks we are at the end of the input string reading.  If so, shift past
354    * the end and returns true.  Otherwise does nothing and returns false.
355    */
CheckEOF()356   [[nodiscard]] bool CheckEOF() { return Check(base::Token::EndOfFile()); }
357 
358   /**
359    * These are shortcuts to obtain the value immediately when the token type
360    * matches.
361    */
362   [[nodiscard]] bool ReadChar(TChar* aValue);
363   [[nodiscard]] bool ReadChar(bool (*aClassifier)(const TChar aChar),
364                               TChar* aValue);
365   [[nodiscard]] bool ReadWord(typename base::TAString& aValue);
366   [[nodiscard]] bool ReadWord(typename base::TDependentSubstring& aValue);
367 
368   /**
369    * This is an integer read helper.  It returns false and doesn't move the read
370    * cursor when any of the following happens:
371    *  - the token at the read cursor is not an integer
372    *  - the final number doesn't fit the T type
373    * Otherwise true is returned, aValue is filled with the integral number
374    * and the cursor is moved forward.
375    */
376   template <typename T>
ReadInteger(T * aValue)377   [[nodiscard]] bool ReadInteger(T* aValue) {
378     MOZ_RELEASE_ASSERT(aValue);
379 
380     typename base::TAString::const_char_iterator rollback = mRollback;
381     typename base::TAString::const_char_iterator cursor = base::mCursor;
382     typename base::Token t;
383     if (!Check(base::TOKEN_INTEGER, t)) {
384       return false;
385     }
386 
387     mozilla::CheckedInt<T> checked(t.AsInteger());
388     if (!checked.isValid()) {
389       // Move to a state as if Check() call has failed
390       mRollback = rollback;
391       base::mCursor = cursor;
392       base::mHasFailed = true;
393       return false;
394     }
395 
396     *aValue = checked.value();
397     return true;
398   }
399 
400   /**
401    * Same as above, but accepts an integer with an optional minus sign.
402    */
403   template <typename T, typename V = std::enable_if_t<
404                             std::is_signed_v<std::remove_pointer_t<T>>,
405                             std::remove_pointer_t<T>>>
ReadSignedInteger(T * aValue)406   [[nodiscard]] bool ReadSignedInteger(T* aValue) {
407     MOZ_RELEASE_ASSERT(aValue);
408 
409     typename base::TAString::const_char_iterator rollback = mRollback;
410     typename base::TAString::const_char_iterator cursor = base::mCursor;
411     auto revert = MakeScopeExit([&] {
412       // Move to a state as if Check() call has failed
413       mRollback = rollback;
414       base::mCursor = cursor;
415       base::mHasFailed = true;
416     });
417 
418     // Using functional raw access because '-' could be part of the word set
419     // making CheckChar('-') not work.
420     bool minus = CheckChar([](const TChar aChar) { return aChar == '-'; });
421 
422     typename base::Token t;
423     if (!Check(base::TOKEN_INTEGER, t)) {
424       return false;
425     }
426 
427     mozilla::CheckedInt<T> checked(t.AsInteger());
428     if (minus) {
429       checked *= -1;
430     }
431 
432     if (!checked.isValid()) {
433       return false;
434     }
435 
436     *aValue = checked.value();
437     revert.release();
438     return true;
439   }
440 
441   /**
442    * Returns the read cursor position back as it was before the last call of any
443    * parsing method of TTokenizer (Next, Check*, Skip*, Read*) so that the last
444    * operation can be repeated. Rollback cannot be used multiple times, it only
445    * reverts the last successful parse operation.  It also cannot be used
446    * before any parsing operation has been called on the TTokenizer.
447    */
448   void Rollback();
449 
450   /**
451    * Record() and Claim() are collecting the input as it is being parsed to
452    * obtain a substring between particular syntax boundaries defined by any
453    * recursive descent parser or simple parser the TTokenizer is used to read
454    * the input for. Inclusion of a token that has just been parsed can be
455    * controlled using an argument.
456    */
457   enum ClaimInclusion {
458     /**
459      * Include resulting (or passed) token of the last lexical analyzer
460      * operation in the result.
461      */
462     INCLUDE_LAST,
463     /**
464      * Do not include it.
465      */
466     EXCLUDE_LAST
467   };
468 
469   /**
470    * Start the process of recording.  Based on aInclude value the beginning of
471    * the recorded sub-string is at the current position (EXCLUDE_LAST) or at the
472    * position before the last parsed token (INCLUDE_LAST).
473    */
474   void Record(ClaimInclusion aInclude = EXCLUDE_LAST);
475   /**
476    * Claim result of the record started with Record() call before.  Depending on
477    * aInclude the ending of the sub-string result includes or excludes the last
478    * parsed or checked token.
479    */
480   void Claim(typename base::TAString& aResult,
481              ClaimInclusion aInclude = EXCLUDE_LAST);
482   void Claim(typename base::TDependentSubstring& aResult,
483              ClaimInclusion aInclude = EXCLUDE_LAST);
484 
485   /**
486    * If aToken is found, aResult is set to the substring between the current
487    * position and the position of aToken, potentially including aToken depending
488    * on aInclude.
489    * If aToken isn't found aResult is set to the substring between the current
490    * position and the end of the string.
491    * If aToken is found, the method returns true. Otherwise it returns false.
492    *
493    * Calling Rollback() after ReadUntil() will return the read cursor to the
494    * position it had before ReadUntil was called.
495    */
496   [[nodiscard]] bool ReadUntil(typename base::Token const& aToken,
497                                typename base::TDependentSubstring& aResult,
498                                ClaimInclusion aInclude = EXCLUDE_LAST);
499   [[nodiscard]] bool ReadUntil(typename base::Token const& aToken,
500                                typename base::TAString& aResult,
501                                ClaimInclusion aInclude = EXCLUDE_LAST);
502 
503  protected:
504   // All these point to the original buffer passed to the TTokenizer's
505   // constructor
506   typename base::TAString::const_char_iterator
507       mRecord;  // Position where the recorded sub-string for Claim() is
508   typename base::TAString::const_char_iterator
509       mRollback;  // Position of the previous token start
510 
511  private:
512   TTokenizer() = delete;
513   TTokenizer(const TTokenizer&) = delete;
514   TTokenizer(TTokenizer&&) = delete;
515   TTokenizer(const TTokenizer&&) = delete;
516   TTokenizer& operator=(const TTokenizer&) = delete;
517 };
518 
519 typedef TTokenizer<char> Tokenizer;
520 typedef TTokenizer<char16_t> Tokenizer16;
521 
522 }  // namespace mozilla
523 
524 #endif  // Tokenizer_h__
525