1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 #ifndef Tokenizer_h__
8 #define Tokenizer_h__
9 
10 #include "nsString.h"
11 #include "mozilla/CheckedInt.h"
12 #include "mozilla/ScopeExit.h"
13 #include "mozilla/TypeTraits.h"
14 #include "mozilla/UniquePtr.h"
15 #include "nsTArray.h"
16 
17 namespace mozilla {
18 
19 class TokenizerBase {
20  public:
21   /**
22    * The analyzer works with elements in the input cut to a sequence of token
23    * where each token has an elementary type
24    */
25   enum TokenType : uint32_t {
26     TOKEN_UNKNOWN,
27     TOKEN_RAW,
28     TOKEN_ERROR,
29     TOKEN_INTEGER,
30     TOKEN_WORD,
31     TOKEN_CHAR,
32     TOKEN_WS,
33     TOKEN_EOL,
34     TOKEN_EOF,
35     TOKEN_CUSTOM0 = 1000
36   };
37 
38   enum ECaseSensitivity { CASE_SENSITIVE, CASE_INSENSITIVE };
39 
40   /**
41    * Class holding the type and the value of a token.  It can be manually
42    * created to allow checks against it via methods of Tokenizer or are results
43    * of some of the Tokenizer's methods.
44    */
45   class Token {
46     TokenType mType;
47     nsDependentCSubstring mWord;
48     nsCString mCustom;
49     char mChar;
50     uint64_t mInteger;
51     ECaseSensitivity mCustomCaseInsensitivity;
52     bool mCustomEnabled;
53 
54     // If this token is a result of the parsing process, this member is
55     // referencing a sub-string in the input buffer.  If this is externally
56     // created Token this member is left an empty string.
57     nsDependentCSubstring mFragment;
58 
59     friend class TokenizerBase;
60     void AssignFragment(nsACString::const_char_iterator begin,
61                         nsACString::const_char_iterator end);
62 
63     static Token Raw();
64 
65    public:
66     Token();
67     Token(const Token& aOther);
68     Token& operator=(const Token& aOther);
69 
70     // Static constructors of tokens by type and value
71     static Token Word(const nsACString& aWord);
72     static Token Char(const char aChar);
73     static Token Number(const uint64_t aNumber);
74     static Token Whitespace();
75     static Token NewLine();
76     static Token EndOfFile();
77     static Token Error();
78 
79     // Compares the two tokens, type must be identical and value
80     // of one of the tokens must be 'any' or equal.
81     bool Equals(const Token& aOther) const;
82 
Type()83     TokenType Type() const { return mType; }
84     char AsChar() const;
85     nsDependentCSubstring AsString() const;
86     uint64_t AsInteger() const;
87 
Fragment()88     nsDependentCSubstring Fragment() const { return mFragment; }
89   };
90 
91   /**
92    * Consumers may register a custom string that, when found in the input, is
93    * considered a token and returned by Next*() and accepted by Check*()
94    * methods. AddCustomToken() returns a reference to a token that can then be
95    * comapred using Token::Equals() againts the output from Next*() or be passed
96    * to Check*().
97    */
98   Token AddCustomToken(const nsACString& aValue,
99                        ECaseSensitivity aCaseInsensitivity,
100                        bool aEnabled = true);
101   template <uint32_t N>
102   Token AddCustomToken(const char (&aValue)[N],
103                        ECaseSensitivity aCaseInsensitivity,
104                        bool aEnabled = true) {
105     return AddCustomToken(nsDependentCSubstring(aValue, N - 1),
106                           aCaseInsensitivity, aEnabled);
107   }
108   void RemoveCustomToken(Token& aToken);
109   /**
110    * Only applies to a custom type of a Token (see AddCustomToken above.)
111    * This turns on and off token recognition.  When a custom token is disabled,
112    * it's ignored as never added as a custom token.
113    */
114   void EnableCustomToken(Token const& aToken, bool aEnable);
115 
116   /**
117    * Mode of tokenization.
118    * FULL tokenization, the default, recognizes built-in tokens and any custom
119    * tokens, if added. CUSTOM_ONLY will only recognize custom tokens, the rest
120    * is seen as 'raw'. This mode can be understood as a 'binary' mode.
121    */
122   enum class Mode { FULL, CUSTOM_ONLY };
123   void SetTokenizingMode(Mode aMode);
124 
125   /**
126    * Return false iff the last Check*() call has returned false or when we've
127    * read past the end of the input string.
128    */
129   MOZ_MUST_USE bool HasFailed() const;
130 
131  protected:
132   explicit TokenizerBase(const char* aWhitespaces = nullptr,
133                          const char* aAdditionalWordChars = nullptr);
134 
135   // false if we have already read the EOF token.
136   bool HasInput() const;
137   // Main parsing function, it doesn't shift the read cursor, just returns the
138   // next token position.
139   nsACString::const_char_iterator Parse(Token& aToken) const;
140   // Is read cursor at the end?
141   bool IsEnd(const nsACString::const_char_iterator& caret) const;
142   // True, when we are at the end of the input data, but it has not been marked
143   // as complete yet.  In that case we cannot proceed with providing a
144   // multi-char token.
145   bool IsPending(const nsACString::const_char_iterator& caret) const;
146   // Is read cursor on a character that is a word start?
147   bool IsWordFirst(const char aInput) const;
148   // Is read cursor on a character that is an in-word letter?
149   bool IsWord(const char aInput) const;
150   // Is read cursor on a character that is a valid number?
151   // TODO - support multiple radix
152   bool IsNumber(const char aInput) const;
153   // Is equal to the given custom token?
154   bool IsCustom(const nsACString::const_char_iterator& caret,
155                 const Token& aCustomToken, uint32_t* aLongest = nullptr) const;
156 
157   // Friendly helper to assign a fragment on a Token
158   static void AssignFragment(Token& aToken,
159                              nsACString::const_char_iterator begin,
160                              nsACString::const_char_iterator end);
161 
162   // true iff we have already read the EOF token
163   bool mPastEof;
164   // true iff the last Check*() call has returned false, reverts to true on
165   // Rollback() call
166   bool mHasFailed;
167   // true if the input string is final (finished), false when we expect more
168   // data yet to be fed to the tokenizer (see IncrementalTokenizer derived
169   // class).
170   bool mInputFinished;
171   // custom only vs full tokenizing mode, see the Parse() method
172   Mode mMode;
173   // minimal raw data chunked delivery during incremental feed
174   uint32_t mMinRawDelivery;
175 
176   // Customizable list of whitespaces
177   const char* mWhitespaces;
178   // Additinal custom word characters
179   const char* mAdditionalWordChars;
180 
181   // All these point to the original buffer passed to the constructor or to the
182   // incremental buffer after FeedInput.
183   nsACString::const_char_iterator
184       mCursor;  // Position of the current (actually next to read) token start
185   nsACString::const_char_iterator mEnd;  // End of the input position
186 
187   // This is the list of tokens user has registered with AddCustomToken()
188   nsTArray<UniquePtr<Token>> mCustomTokens;
189   uint32_t mNextCustomTokenID;
190 
191  private:
192   TokenizerBase() = delete;
193   TokenizerBase(const TokenizerBase&) = delete;
194   TokenizerBase(TokenizerBase&&) = delete;
195   TokenizerBase(const TokenizerBase&&) = delete;
196   TokenizerBase& operator=(const TokenizerBase&) = delete;
197 };
198 
/**
 * This is a simple implementation of a lexical analyzer or maybe better
 * called a tokenizer.  It doesn't allow any user dictionaries or
 * user-defined token types.
 *
 * It is limited only to ASCII input for now. UTF-8 or any other input
 * encoding must yet be implemented.
 */
207 class Tokenizer : public TokenizerBase {
208  public:
209   /**
210    * @param aSource
211    *    The string to parse.
212    *    IMPORTANT NOTE: Tokenizer doesn't ensure the input string buffer
213    * lifetime. It's up to the consumer to make sure the string's buffer outlives
214    * the Tokenizer!
215    * @param aWhitespaces
216    *    If non-null Tokenizer will use this custom set of whitespaces for
217    * CheckWhite() and SkipWhites() calls. By default the list consists of space
218    * and tab.
219    * @param aAdditionalWordChars
220    *    If non-null it will be added to the list of characters that consist a
221    * word. This is useful when you want to accept e.g. '-' in HTTP headers. By
222    * default a word character is consider any character for which upper case
223    *    is different from lower case.
224    *
225    * If there is an overlap between aWhitespaces and aAdditionalWordChars, the
226    * check for word characters is made first.
227    */
228   explicit Tokenizer(const nsACString& aSource,
229                      const char* aWhitespaces = nullptr,
230                      const char* aAdditionalWordChars = nullptr);
231   explicit Tokenizer(const char* aSource, const char* aWhitespaces = nullptr,
232                      const char* aAdditionalWordChars = nullptr);
233 
234   /**
235    * When there is still anything to read from the input, tokenize it, store the
236    * token type and value to aToken result and shift the cursor past this just
237    * parsed token.  Each call to Next() reads another token from the input and
238    * shifts the cursor. Returns false if we have passed the end of the input.
239    */
240   MOZ_MUST_USE
241   bool Next(Token& aToken);
242 
243   /**
244    * Parse the token on the input read cursor position, check its type is equal
245    * to aTokenType and if so, put it into aResult, shift the cursor and return
246    * true.  Otherwise, leave the input read cursor position intact and return
247    * false.
248    */
249   MOZ_MUST_USE
250   bool Check(const TokenType aTokenType, Token& aResult);
251   /**
252    * Same as above method, just compares both token type and token value passed
253    * in aToken. When both the type and the value equals, shift the cursor and
254    * return true.  Otherwise return false.
255    */
256   MOZ_MUST_USE
257   bool Check(const Token& aToken);
258 
259   /**
260    * SkipWhites method (below) may also skip new line characters automatically.
261    */
262   enum WhiteSkipping {
263     /**
264      * SkipWhites will only skip what is defined as a white space (default).
265      */
266     DONT_INCLUDE_NEW_LINE = 0,
267     /**
268      * SkipWhites will skip definited white spaces as well as new lines
269      * automatically.
270      */
271     INCLUDE_NEW_LINE = 1
272   };
273 
274   /**
275    * Skips any occurence of whitespaces specified in mWhitespaces member,
276    * optionally skip also new lines.
277    */
278   void SkipWhites(WhiteSkipping aIncludeNewLines = DONT_INCLUDE_NEW_LINE);
279 
280   /**
281    * Skips all tokens until the given one is found or EOF is hit.  The token
282    * or EOF are next to read.
283    */
284   void SkipUntil(Token const& aToken);
285 
286   // These are mostly shortcuts for the Check() methods above.
287 
288   /**
289    * Check whitespace character is present.
290    */
291   MOZ_MUST_USE
CheckWhite()292   bool CheckWhite() { return Check(Token::Whitespace()); }
293   /**
294    * Check there is a single character on the read cursor position.  If so,
295    * shift the read cursor position and return true.  Otherwise false.
296    */
297   MOZ_MUST_USE
CheckChar(const char aChar)298   bool CheckChar(const char aChar) { return Check(Token::Char(aChar)); }
299   /**
300    * This is a customizable version of CheckChar.  aClassifier is a function
301    * called with value of the character on the current input read position.  If
302    * this user function returns true, read cursor is shifted and true returned.
303    * Otherwise false. The user classifiction function is not called when we are
304    * at or past the end and false is immediately returned.
305    */
306   MOZ_MUST_USE
307   bool CheckChar(bool (*aClassifier)(const char aChar));
308   /**
309    * Check for a whole expected word.
310    */
311   MOZ_MUST_USE
CheckWord(const nsACString & aWord)312   bool CheckWord(const nsACString& aWord) { return Check(Token::Word(aWord)); }
313   /**
314    * Shortcut for literal const word check with compile time length calculation.
315    */
316   template <uint32_t N>
CheckWord(const char (& aWord)[N])317   MOZ_MUST_USE bool CheckWord(const char (&aWord)[N]) {
318     return Check(Token::Word(nsDependentCString(aWord, N - 1)));
319   }
320   /**
321    * Checks \r, \n or \r\n.
322    */
323   MOZ_MUST_USE
CheckEOL()324   bool CheckEOL() { return Check(Token::NewLine()); }
325   /**
326    * Checks we are at the end of the input string reading.  If so, shift past
327    * the end and returns true.  Otherwise does nothing and returns false.
328    */
329   MOZ_MUST_USE
CheckEOF()330   bool CheckEOF() { return Check(Token::EndOfFile()); }
331 
332   /**
333    * These are shortcuts to obtain the value immediately when the token type
334    * matches.
335    */
336   MOZ_MUST_USE bool ReadChar(char* aValue);
337   MOZ_MUST_USE bool ReadChar(bool (*aClassifier)(const char aChar),
338                              char* aValue);
339   MOZ_MUST_USE bool ReadWord(nsACString& aValue);
340   MOZ_MUST_USE bool ReadWord(nsDependentCSubstring& aValue);
341 
342   /**
343    * This is an integer read helper.  It returns false and doesn't move the read
344    * cursor when any of the following happens:
345    *  - the token at the read cursor is not an integer
346    *  - the final number doesn't fit the T type
347    * Otherwise true is returned, aValue is filled with the integral number
348    * and the cursor is moved forward.
349    */
350   template <typename T>
ReadInteger(T * aValue)351   MOZ_MUST_USE bool ReadInteger(T* aValue) {
352     MOZ_RELEASE_ASSERT(aValue);
353 
354     nsACString::const_char_iterator rollback = mRollback;
355     nsACString::const_char_iterator cursor = mCursor;
356     Token t;
357     if (!Check(TOKEN_INTEGER, t)) {
358       return false;
359     }
360 
361     mozilla::CheckedInt<T> checked(t.AsInteger());
362     if (!checked.isValid()) {
363       // Move to a state as if Check() call has failed
364       mRollback = rollback;
365       mCursor = cursor;
366       mHasFailed = true;
367       return false;
368     }
369 
370     *aValue = checked.value();
371     return true;
372   }
373 
374   /**
375    * Same as above, but accepts an integer with an optional minus sign.
376    */
377   template <typename T, typename V = typename EnableIf<
378                             IsSigned<typename RemovePointer<T>::Type>::value,
379                             typename RemovePointer<T>::Type>::Type>
ReadSignedInteger(T * aValue)380   MOZ_MUST_USE bool ReadSignedInteger(T* aValue) {
381     MOZ_RELEASE_ASSERT(aValue);
382 
383     nsACString::const_char_iterator rollback = mRollback;
384     nsACString::const_char_iterator cursor = mCursor;
385     auto revert = MakeScopeExit([&] {
386       // Move to a state as if Check() call has failed
387       mRollback = rollback;
388       mCursor = cursor;
389       mHasFailed = true;
390     });
391 
392     // Using functional raw access because '-' could be part of the word set
393     // making CheckChar('-') not work.
394     bool minus = CheckChar([](const char aChar) { return aChar == '-'; });
395 
396     Token t;
397     if (!Check(TOKEN_INTEGER, t)) {
398       return false;
399     }
400 
401     mozilla::CheckedInt<T> checked(t.AsInteger());
402     if (minus) {
403       checked *= -1;
404     }
405 
406     if (!checked.isValid()) {
407       return false;
408     }
409 
410     *aValue = checked.value();
411     revert.release();
412     return true;
413   }
414 
415   /**
416    * Returns the read cursor position back as it was before the last call of any
417    * parsing method of Tokenizer (Next, Check*, Skip*, Read*) so that the last
418    * operation can be repeated. Rollback cannot be used multiple times, it only
419    * reverts the last successfull parse operation.  It also cannot be used
420    * before any parsing operation has been called on the Tokenizer.
421    */
422   void Rollback();
423 
424   /**
425    * Record() and Claim() are collecting the input as it is being parsed to
426    * obtain a substring between particular syntax bounderies defined by any
427    * recursive descent parser or simple parser the Tokenizer is used to read the
428    * input for. Inlucsion of a token that has just been parsed can be controlled
429    * using an arguemnt.
430    */
431   enum ClaimInclusion {
432     /**
433      * Include resulting (or passed) token of the last lexical analyzer
434      * operation in the result.
435      */
436     INCLUDE_LAST,
437     /**
438      * Do not include it.
439      */
440     EXCLUDE_LAST
441   };
442 
443   /**
444    * Start the process of recording.  Based on aInclude value the begining of
445    * the recorded sub-string is at the current position (EXCLUDE_LAST) or at the
446    * position before the last parsed token (INCLUDE_LAST).
447    */
448   void Record(ClaimInclusion aInclude = EXCLUDE_LAST);
449   /**
450    * Claim result of the record started with Record() call before.  Depending on
451    * aInclude the ending of the sub-string result includes or excludes the last
452    * parsed or checked token.
453    */
454   void Claim(nsACString& aResult, ClaimInclusion aInclude = EXCLUDE_LAST);
455   void Claim(nsDependentCSubstring& aResult,
456              ClaimInclusion aInclude = EXCLUDE_LAST);
457 
458   /**
459    * If aToken is found, aResult is set to the substring between the current
460    * position and the position of aToken, potentially including aToken depending
461    * on aInclude.
462    * If aToken isn't found aResult is set to the substring between the current
463    * position and the end of the string.
464    * If aToken is found, the method returns true. Otherwise it returns false.
465    *
466    * Calling Rollback() after ReadUntil() will return the read cursor to the
467    * position it had before ReadUntil was called.
468    */
469   MOZ_MUST_USE bool ReadUntil(Token const& aToken,
470                               nsDependentCSubstring& aResult,
471                               ClaimInclusion aInclude = EXCLUDE_LAST);
472   MOZ_MUST_USE bool ReadUntil(Token const& aToken, nsACString& aResult,
473                               ClaimInclusion aInclude = EXCLUDE_LAST);
474 
475  protected:
476   // All these point to the original buffer passed to the Tokenizer's
477   // constructor
478   nsACString::const_char_iterator
479       mRecord;  // Position where the recorded sub-string for Claim() is
480   nsACString::const_char_iterator
481       mRollback;  // Position of the previous token start
482 
483  private:
484   Tokenizer() = delete;
485   Tokenizer(const Tokenizer&) = delete;
486   Tokenizer(Tokenizer&&) = delete;
487   Tokenizer(const Tokenizer&&) = delete;
488   Tokenizer& operator=(const Tokenizer&) = delete;
489 };
490 
491 }  // namespace mozilla
492 
493 #endif  // Tokenizer_h__
494