1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 #ifndef Tokenizer_h__
8 #define Tokenizer_h__
9 
10 #include "nsString.h"
11 #include "mozilla/CheckedInt.h"
12 #include "mozilla/UniquePtr.h"
13 #include "nsTArray.h"
14 
15 namespace mozilla {
16 
/**
 * Common base of the tokenizer classes.  It defines the token types and the
 * Token value class, keeps the list of registered custom tokens and holds the
 * read cursor together with the low-level parsing helpers.  The constructor is
 * protected, so the class is only usable through a derived class such as
 * Tokenizer.
 */
17 class TokenizerBase
18 {
19 public:
20   /**
21    * The analyzer cuts the input into a sequence of tokens, where each token
22    * has one of the following elementary types.
23    */
24   enum TokenType : uint32_t
25   {
26     TOKEN_UNKNOWN,
27     TOKEN_RAW,
28     TOKEN_ERROR,
29     TOKEN_INTEGER,
30     TOKEN_WORD,
31     TOKEN_CHAR,
32     TOKEN_WS,
33     TOKEN_EOL,
34     TOKEN_EOF,
35     TOKEN_CUSTOM0 = 1000 // NOTE(review): presumably the base value for custom token types -- confirm
36   };
37 
38   enum ECaseSensitivity // how a custom token is compared, see AddCustomToken() below
39   {
40     CASE_SENSITIVE,
41     CASE_INSENSITIVE
42   };
43 
44   /**
45    * Class holding the type and the value of a token.  Tokens are either created
46    * manually, to allow checks against them via methods of Tokenizer, or they are
47    * results of some of the Tokenizer's methods.
48    */
49   class Token
50   {
51     TokenType mType;
52     nsDependentCSubstring mWord;
53     nsCString mCustom;
54     char mChar;
55     uint64_t mInteger;
56     ECaseSensitivity mCustomCaseInsensitivity;
57     bool mCustomEnabled;
58 
59     // If this token is a result of the parsing process, this member is referencing
60     // a sub-string in the input buffer.  If this is an externally created Token, this
61     // member is left as an empty string.
62     nsDependentCSubstring mFragment;
63 
64     friend class TokenizerBase;
65     void AssignFragment(nsACString::const_char_iterator begin,
66                         nsACString::const_char_iterator end);
67 
68     static Token Raw();
69 
70   public:
71     Token();
72     Token(const Token& aOther);
73     Token& operator=(const Token& aOther);
74 
75     // Static constructors of tokens by type and value
76     static Token Word(const nsACString& aWord);
77     static Token Char(const char aChar);
78     static Token Number(const uint64_t aNumber);
79     static Token Whitespace();
80     static Token NewLine();
81     static Token EndOfFile();
82     static Token Error();
83 
84     // Compares the two tokens: the types must be identical and the value
85     // of one of the tokens must be 'any' or the values must be equal.
86     bool Equals(const Token& aOther) const;
87 
Type()88     TokenType Type() const { return mType; }
89     char AsChar() const;
90     nsDependentCSubstring AsString() const;
91     uint64_t AsInteger() const;
92 
Fragment()93     nsDependentCSubstring Fragment() const { return mFragment; }
94   };
95 
96   /**
97    * Consumers may register a custom string that, when found in the input, is considered
98    * a token and returned by Next*() and accepted by Check*() methods.
99    * AddCustomToken() returns a token that can then be compared using
100    * Token::Equals() against the output from Next*() or be passed to Check*().
101    */
102   Token AddCustomToken(const nsACString& aValue, ECaseSensitivity aCaseInsensitivity, bool aEnabled = true);
103   template <uint32_t N>
104   Token AddCustomToken(const char(&aValue)[N], ECaseSensitivity aCaseInsensitivity, bool aEnabled = true)
105   {
106     return AddCustomToken(nsDependentCSubstring(aValue, N - 1), aCaseInsensitivity, aEnabled); // N - 1 leaves out the literal's terminating null
107   }
108   void RemoveCustomToken(Token& aToken); // NOTE(review): presumably unregisters a token returned by AddCustomToken() -- confirm in the implementation
109   /**
110    * Only applies to a custom type of a Token (see AddCustomToken above.)
111    * This turns on and off token recognition.  When a custom token is disabled,
112    * it's ignored as if it had never been added as a custom token.
113    */
114   void EnableCustomToken(Token const& aToken, bool aEnable);
115 
116   /**
117    * Mode of tokenization.
118    * FULL tokenization, the default, recognizes built-in tokens and any custom tokens,
119    * if added.
120    * CUSTOM_ONLY will only recognize custom tokens, the rest is seen as 'raw'.
121    * This mode can be understood as a 'binary' mode.
122    */
123   enum class Mode
124   {
125     FULL,
126     CUSTOM_ONLY
127   };
128   void SetTokenizingMode(Mode aMode);
129 
130   /**
131    * Reports whether the last Check*() call has returned false or we've read past
132    * the end of the input string.  NOTE(review): this used to read "Return false iff"; judging by the method name and mHasFailed the result is presumably true in those cases -- confirm against the implementation.
133    */
134   MOZ_MUST_USE bool HasFailed() const;
135 
136 protected:
137   explicit TokenizerBase(const char* aWhitespaces = nullptr,
138                          const char* aAdditionalWordChars = nullptr);
139 
140   // false if we have already read the EOF token.
141   bool HasInput() const;
142   // Main parsing function, it doesn't shift the read cursor, just returns the next
143   // token position.
144   nsACString::const_char_iterator Parse(Token& aToken) const;
145   // Is read cursor at the end?
146   bool IsEnd(const nsACString::const_char_iterator& caret) const;
147   // True, when we are at the end of the input data, but it has not been marked
148   // as complete yet.  In that case we cannot proceed with providing a multi-char token.
149   bool IsPending(const nsACString::const_char_iterator & caret) const;
150   // Is read cursor on a character that is a word start?
151   bool IsWordFirst(const char aInput) const;
152   // Is read cursor on a character that is an in-word letter?
153   bool IsWord(const char aInput) const;
154   // Is read cursor on a character that is a valid number?
155   // TODO - support multiple radix
156   bool IsNumber(const char aInput) const;
157   // Is equal to the given custom token?
158   bool IsCustom(const nsACString::const_char_iterator& caret,
159                 const Token& aCustomToken, uint32_t* aLongest = nullptr) const;
160 
161   // Friendly helper to assign a fragment on a Token
162   static void AssignFragment(Token& aToken,
163                              nsACString::const_char_iterator begin,
164                              nsACString::const_char_iterator end);
165 
166   // true iff we have already read the EOF token
167   bool mPastEof;
168   // true iff the last Check*() call has returned false.  NOTE(review): the original comment said it "reverts to true on Rollback() call", which is presumably a typo for false -- confirm in Rollback().
169   bool mHasFailed;
170   // true if the input string is final (finished), false when we expect more data
171   // yet to be fed to the tokenizer (see IncrementalTokenizer derived class).
172   bool mInputFinished;
173   // custom only vs full tokenizing mode, see the Parse() method
174   Mode mMode;
175   // minimal raw data chunked delivery during incremental feed
176   uint32_t mMinRawDelivery;
177 
178   // Customizable list of whitespaces
179   const char* mWhitespaces;
180   // Additional custom word characters
181   const char* mAdditionalWordChars;
182 
183   // All these point to the original buffer passed to the constructor or to the incremental
184   // buffer after FeedInput.
185   nsACString::const_char_iterator mCursor; // Position of the current (actually next to read) token start
186   nsACString::const_char_iterator mEnd; // End of the input position
187 
188   // This is the list of tokens the user has registered with AddCustomToken()
189   nsTArray<UniquePtr<Token>> mCustomTokens;
190   uint32_t mNextCustomTokenID;
191 
192 private: // default construction, copying and moving are disabled
193   TokenizerBase() = delete;
194   TokenizerBase(const TokenizerBase&) = delete;
195   TokenizerBase(TokenizerBase&&) = delete;
196   TokenizerBase(const TokenizerBase&&) = delete;
197   TokenizerBase &operator=(const TokenizerBase&) = delete;
198 };
199 
200 /**
201  * This is a simple implementation of a lexical analyzer or maybe better
202  * called a tokenizer.  It doesn't allow any user dictionaries or
203  * user-defined token types.
204  *
205  * It is limited only to ASCII input for now. UTF-8 or any other input
206  * encoding must yet be implemented.
207  */
208 class Tokenizer : public TokenizerBase
209 {
210 public:
211   /**
212    * @param aSource
213    *    The string to parse.
214    *    IMPORTANT NOTE: Tokenizer doesn't ensure the input string buffer lifetime.
215    *    It's up to the consumer to make sure the string's buffer outlives the Tokenizer!
216    * @param aWhitespaces
217    *    If non-null Tokenizer will use this custom set of whitespaces for CheckWhite()
218    *    and SkipWhites() calls.
219    *    By default the list consists of space and tab.
220    * @param aAdditionalWordChars
221    *    If non-null it will be added to the list of characters that constitute a word.
222    *    This is useful when you want to accept e.g. '-' in HTTP headers.
223    *    By default a word character is considered to be any character for which upper case
224    *    is different from lower case.
225    *
226    * If there is an overlap between aWhitespaces and aAdditionalWordChars, the check for
227    * word characters is made first.
228    */
229   explicit Tokenizer(const nsACString& aSource,
230                      const char* aWhitespaces = nullptr,
231                      const char* aAdditionalWordChars = nullptr);
232   explicit Tokenizer(const char* aSource,
233                      const char* aWhitespaces = nullptr,
234                      const char* aAdditionalWordChars = nullptr);
235 
236   /**
237    * When there is still anything to read from the input, tokenize it, store the token type
238    * and value to aToken result and shift the cursor past this just parsed token.  Each call
239    * to Next() reads another token from the input and shifts the cursor.
240    * Returns false if we have passed the end of the input.
241    */
242   MOZ_MUST_USE
243   bool Next(Token& aToken);
244 
245   /**
246    * Parse the token on the input read cursor position, check its type is equal to aTokenType
247    * and if so, put it into aResult, shift the cursor and return true.  Otherwise, leave
248    * the input read cursor position intact and return false.
249    */
250   MOZ_MUST_USE
251   bool Check(const TokenType aTokenType, Token& aResult);
252   /**
253    * Same as the above method, but compares both the token type and the token value passed
254    * in aToken.  When both the type and the value are equal, shift the cursor and return
255    * true.  Otherwise return false.
256    */
257   MOZ_MUST_USE
258   bool Check(const Token& aToken);
259 
260   /**
261    * SkipWhites method (below) may also skip new line characters automatically.
262    */
263   enum WhiteSkipping {
264     /**
265      * SkipWhites will only skip what is defined as a white space (default).
266      */
267     DONT_INCLUDE_NEW_LINE = 0,
268     /**
269      * SkipWhites will skip defined white spaces as well as new lines
270      * automatically.
271      */
272     INCLUDE_NEW_LINE = 1
273   };
274 
275   /**
276    * Skips any occurrence of whitespaces specified in mWhitespaces member,
277    * optionally also skips new lines.
278    */
279   void SkipWhites(WhiteSkipping aIncludeNewLines = DONT_INCLUDE_NEW_LINE);
280 
281   /**
282    * Skips all tokens until the given one is found or EOF is hit.  The token
283    * or EOF are next to read.
284    */
285   void SkipUntil(Token const& aToken);
286 
287   // These are mostly shortcuts for the Check() methods above.
288 
289   /**
290    * Check that a whitespace character is present.
291    */
292   MOZ_MUST_USE
CheckWhite()293   bool CheckWhite() { return Check(Token::Whitespace()); }
294   /**
295    * Check there is a single character on the read cursor position.  If so, shift the read
296    * cursor position and return true.  Otherwise false.
297    */
298   MOZ_MUST_USE
CheckChar(const char aChar)299   bool CheckChar(const char aChar) { return Check(Token::Char(aChar)); }
300   /**
301    * This is a customizable version of CheckChar.  aClassifier is a function called with
302    * value of the character on the current input read position.  If this user function
303    * returns true, read cursor is shifted and true returned.  Otherwise false.
304    * The user classification function is not called when we are at or past the end and
305    * false is immediately returned.
306    */
307   MOZ_MUST_USE
308   bool CheckChar(bool (*aClassifier)(const char aChar));
309   /**
310    * Check for a whole expected word.
311    */
312   MOZ_MUST_USE
CheckWord(const nsACString & aWord)313   bool CheckWord(const nsACString& aWord) { return Check(Token::Word(aWord)); }
314   /**
315    * Shortcut for literal const word check with compile time length calculation.
316    */
317   template <uint32_t N>
318   MOZ_MUST_USE
CheckWord(const char (& aWord)[N])319   bool CheckWord(const char (&aWord)[N]) { return Check(Token::Word(nsDependentCString(aWord, N - 1))); }
320   /**
321    * Checks \r, \n or \r\n.
322    */
323   MOZ_MUST_USE
CheckEOL()324   bool CheckEOL() { return Check(Token::NewLine()); }
325   /**
326    * Checks we are at the end of the input string reading.  If so, shifts past the end
327    * and returns true.  Otherwise does nothing and returns false.
328    */
329   MOZ_MUST_USE
CheckEOF()330   bool CheckEOF() { return Check(Token::EndOfFile()); }
331 
332   /**
333    * These are shortcuts to obtain the value immediately when the token type matches.
334    */
335   MOZ_MUST_USE bool ReadChar(char* aValue);
336   MOZ_MUST_USE bool ReadChar(bool (*aClassifier)(const char aChar),
337                              char* aValue);
338   MOZ_MUST_USE bool ReadWord(nsACString& aValue);
339   MOZ_MUST_USE bool ReadWord(nsDependentCSubstring& aValue);
340 
341   /**
342    * This is an integer read helper.  It returns false and doesn't move the read
343    * cursor when any of the following happens:
344    *  - the token at the read cursor is not an integer
345    *  - the final number doesn't fit the T type
346    * Otherwise true is returned, aValue is filled with the integral number
347    * and the cursor is moved forward.
348    */
349   template <typename T>
ReadInteger(T * aValue)350   MOZ_MUST_USE bool ReadInteger(T* aValue)
351   {
352     MOZ_RELEASE_ASSERT(aValue); // aValue is written through below, so it must not be null
353 
354     nsACString::const_char_iterator rollback = mRollback; // both positions are saved so they can be
355     nsACString::const_char_iterator cursor = mCursor;     // restored when the number doesn't fit T
356     Token t;
357     if (!Check(TOKEN_INTEGER, t)) {
358       return false;
359     }
360 
361     mozilla::CheckedInt<T> checked(t.AsInteger()); // range-check the parsed uint64_t value against T
362     if (!checked.isValid()) {
363       // Move to a state as if the Check() call had failed
364       mRollback = rollback;
365       mCursor = cursor;
366       mHasFailed = true;
367       return false;
368     }
369 
370     *aValue = checked.value();
371     return true;
372   }
373 
374   /**
375    * Returns the read cursor position back as it was before the last call of any parsing
376    * method of Tokenizer (Next, Check*, Skip*, Read*) so that the last operation
377    * can be repeated.
378    * Rollback cannot be used multiple times, it only reverts the last successful parse
379    * operation.  It also cannot be used before any parsing operation has been called
380    * on the Tokenizer.
381    */
382   void Rollback();
383 
384   /**
385    * Record() and Claim() are collecting the input as it is being parsed to obtain
386    * a substring between particular syntax boundaries defined by any recursive
387    * descent parser or simple parser the Tokenizer is used to read the input for.
388    * Inclusion of a token that has just been parsed can be controlled using an argument.
389    */
390   enum ClaimInclusion {
391     /**
392      * Include resulting (or passed) token of the last lexical analyzer operation in the result.
393      */
394     INCLUDE_LAST,
395     /**
396      * Do not include it.
397      */
398     EXCLUDE_LAST
399   };
400 
401   /**
402    * Start the process of recording.  Based on aInclude value the beginning of the recorded
403    * sub-string is at the current position (EXCLUDE_LAST) or at the position before the last
404    * parsed token (INCLUDE_LAST).
405    */
406   void Record(ClaimInclusion aInclude = EXCLUDE_LAST);
407   /**
408    * Claim result of the record started with Record() call before.  Depending on aInclude
409    * the ending of the sub-string result includes or excludes the last parsed or checked
410    * token.
411    */
412   void Claim(nsACString& aResult, ClaimInclusion aInclude = EXCLUDE_LAST);
413   void Claim(nsDependentCSubstring& aResult, ClaimInclusion aInclude = EXCLUDE_LAST);
414 
415   /**
416    * If aToken is found, aResult is set to the substring between the current
417    * position and the position of aToken, potentially including aToken depending
418    * on aInclude.
419    * If aToken isn't found aResult is set to the substring between the current
420    * position and the end of the string.
421    * If aToken is found, the method returns true. Otherwise it returns false.
422    *
423    * Calling Rollback() after ReadUntil() will return the read cursor to the
424    * position it had before ReadUntil was called.
425    */
426   MOZ_MUST_USE bool ReadUntil(Token const& aToken, nsDependentCSubstring& aResult,
427                               ClaimInclusion aInclude = EXCLUDE_LAST);
428   MOZ_MUST_USE bool ReadUntil(Token const& aToken, nsACString& aResult,
429                               ClaimInclusion aInclude = EXCLUDE_LAST);
430 
431 protected:
432   // All these point to the original buffer passed to the Tokenizer's constructor
433   nsACString::const_char_iterator mRecord; // Position where the recorded sub-string for Claim() begins
434   nsACString::const_char_iterator mRollback; // Position of the previous token start
435 
436 private: // default construction, copying and moving are disabled
437   Tokenizer() = delete;
438   Tokenizer(const Tokenizer&) = delete;
439   Tokenizer(Tokenizer&&) = delete;
440   Tokenizer(const Tokenizer&&) = delete;
441   Tokenizer &operator=(const Tokenizer&) = delete;
442 };
443 
444 } // mozilla
445 
446 #endif // Tokenizer_h__
447