1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 #ifndef INCREMENTAL_TOKENIZER_H__
8 #define INCREMENTAL_TOKENIZER_H__
9 
10 #include "mozilla/Tokenizer.h"
11 
12 #include "nsError.h"
13 #include <functional>
14 
15 class nsIInputStream;
16 
17 namespace mozilla {
18 
19 class IncrementalTokenizer : public TokenizerBase {
20  public:
21   /**
22    * The consumer callback.  The function is called for every single token
23    * as found in the input.  Failure result returned by this callback stops
24    * the tokenization immediately and bubbles to result of Feed/FinishInput.
25    *
26    * Fragment()s of consumed tokens are ensured to remain valid until next call
27    * to Feed/FinishInput and are pointing to a single linear buffer.  Hence,
28    * those can be safely used to accumulate the data for processing after
29    * Feed/FinishInput returned.
30    */
31   typedef std::function<nsresult(Token const&, IncrementalTokenizer& i)>
32       Consumer;
33 
34   /**
35    * For aWhitespaces and aAdditionalWordChars arguments see TokenizerBase.
36    *
37    * @param aConsumer
38    *    A mandatory non-null argument, a function that consumes the tokens as
39    * they come when the tokenizer is fed.
40    * @param aRawMinBuffered
41    *    When we have buffered at least aRawMinBuffered data, but there was no
42    * custom token found so far because of too small incremental feed chunks,
43    * deliver the raw data to preserve streaming and to save memory.  This
44    * only has effect in OnlyCustomTokenizing mode.
45    */
46   explicit IncrementalTokenizer(Consumer&& aConsumer,
47                                 const char* aWhitespaces = nullptr,
48                                 const char* aAdditionalWordChars = nullptr,
49                                 uint32_t aRawMinBuffered = 1024);
50 
51   /**
52    * Pushes the input to be tokenized.  These directly call the Consumer
53    * callback on every found token.  Result of the Consumer callback is returned
54    * here.
55    *
56    * The tokenizer must be initialized with a valid consumer prior call to these
57    * methods.  It's not allowed to call Feed/FinishInput from inside the
58    * Consumer callback.
59    */
60   nsresult FeedInput(const nsACString& aInput);
61   nsresult FeedInput(nsIInputStream* aInput, uint32_t aCount);
62   nsresult FinishInput();
63 
64   /**
65    * Can only be called from inside the consumer callback.
66    *
67    * When there is still anything to read from the input, tokenize it, store
68    * the token type and value to aToken result and shift the cursor past this
69    * just parsed token.  Each call to Next() reads another token from
70    * the input and shifts the cursor.
71    *
72    * Returns false if there is not enough data to deterministically recognize
73    * tokens or when the last returned token was EOF.
74    */
75   MOZ_MUST_USE
76   bool Next(Token& aToken);
77 
78   /**
79    * Can only be called from inside the consumer callback.
80    *
81    * Tells the tokenizer to revert the cursor and stop the async parsing until
82    * next feed of the input.  This is useful when more than one token is needed
83    * to decide on the syntax but there is not enough input to get a next token
84    * (Next() returned false.)
85    */
86   void NeedMoreInput();
87 
88   /**
89    * Can only be called from inside the consumer callback.
90    *
91    * This makes the consumer callback be called again while parsing
92    * the input at the previous cursor position again.  This is useful when
93    * the tokenizer state (custom tokens, tokenization mode) has changed and
94    * we want to re-parse the input again.
95    */
96   void Rollback();
97 
98  private:
99   // Loops over the input with TokenizerBase::Parse and calls the Consumer
100   // callback.
101   nsresult Process();
102 
103 #ifdef DEBUG
104   // True when inside the consumer callback, used only for assertions.
105   bool mConsuming;
106 #endif  // DEBUG
107   // Modifyable only from the Consumer callback, tells the parser to break,
108   // rollback and wait for more input.
109   bool mNeedMoreInput;
110   // Modifyable only from the Consumer callback, tells the parser to rollback
111   // and parse the input again, with (if modified) new settings of the
112   // tokenizer.
113   bool mRollback;
114   // The input buffer.  Updated with each call to Feed/FinishInput.
115   nsCString mInput;
116   // Numerical index pointing at the current cursor position.  We don't keep
117   // direct reference to the string buffer since the buffer gets often
118   // reallocated.
119   nsCString::index_type mInputCursor;
120   // Refernce to the consumer function.
121   Consumer mConsumer;
122 };
123 
124 }  // namespace mozilla
125 
126 #endif
127