1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 #include "Tokenizer.h"
8 
9 #include "nsUnicharUtils.h"
10 #include <algorithm>
11 
12 namespace mozilla {
13 
// Default whitespace set used when the caller passes a null aWhitespaces:
// space and tab only (new lines are tokenized separately).
static const char sWhitespaces[] = " \t";
15 
// Tokenizes the whole |aSource| string at once; the input is complete from
// the start (mInputFinished = true), so Parse() never reports pending data.
Tokenizer::Tokenizer(const nsACString& aSource,
                     const char* aWhitespaces,
                     const char* aAdditionalWordChars)
  : TokenizerBase(aWhitespaces, aAdditionalWordChars)
{
  mInputFinished = true;
  aSource.BeginReading(mCursor);
  // Both the record and rollback marks start at the beginning of the input.
  mRecord = mRollback = mCursor;
  aSource.EndReading(mEnd);
}
26 
// Convenience overload for null-terminated C strings; delegates to the
// nsACString constructor through a dependent (non-copying) string wrapper.
Tokenizer::Tokenizer(const char* aSource,
                     const char* aWhitespaces,
                     const char* aAdditionalWordChars)
  : Tokenizer(nsDependentCString(aSource), aWhitespaces, aAdditionalWordChars)
{
}
33 
// Parses and consumes the next token.  On success fills |aToken| (including
// its source fragment) and returns true.  Returns false and sets the failed
// flag once the EOF token has already been consumed.
bool
Tokenizer::Next(Token& aToken)
{
  if (!HasInput()) {
    mHasFailed = true;
    return false;
  }

  // Remember where this token started so Rollback() can undo the read.
  mRollback = mCursor;
  mCursor = Parse(aToken);

  AssignFragment(aToken, mRollback, mCursor);

  mPastEof = aToken.Type() == TOKEN_EOF;
  mHasFailed = false;
  return true;
}
51 
// Like Next(), but consumes the token only when its type matches
// |aTokenType|.  On mismatch the cursor stays put, the failed flag is set
// and false is returned (|aResult| still holds the parsed token).
bool
Tokenizer::Check(const TokenType aTokenType, Token& aResult)
{
  if (!HasInput()) {
    mHasFailed = true;
    return false;
  }

  nsACString::const_char_iterator next = Parse(aResult);
  if (aTokenType != aResult.Type()) {
    mHasFailed = true;
    return false;
  }

  // Commit: advance the cursor and remember the token start for Rollback().
  mRollback = mCursor;
  mCursor = next;

  AssignFragment(aResult, mRollback, mCursor);

  mPastEof = aResult.Type() == TOKEN_EOF;
  mHasFailed = false;
  return true;
}
75 
// Like Next(), but consumes the next token only when it equals |aToken|
// (type and, where applicable, value).  Note: unlike the TokenType overload,
// this one does not assign a source fragment to any token.
bool
Tokenizer::Check(const Token& aToken)
{
  if (!HasInput()) {
    mHasFailed = true;
    return false;
  }

  Token parsed;
  nsACString::const_char_iterator next = Parse(parsed);
  if (!aToken.Equals(parsed)) {
    mHasFailed = true;
    return false;
  }

  // Commit the read.
  mRollback = mCursor;
  mCursor = next;
  mPastEof = parsed.Type() == TOKEN_EOF;
  mHasFailed = false;
  return true;
}
97 
// Consumes a run of whitespace tokens and, when |aIncludeNewLines| is
// INCLUDE_NEW_LINE, new-line tokens as well.  mRollback is preserved across
// the whole run so a later Rollback() returns to before the token read
// prior to this call, not into the middle of the skipped whitespace.
void
Tokenizer::SkipWhites(WhiteSkipping aIncludeNewLines)
{
  // Nothing to skip at all?
  if (!CheckWhite() && (aIncludeNewLines == DONT_INCLUDE_NEW_LINE || !CheckEOL())) {
    return;
  }

  nsACString::const_char_iterator rollback = mRollback;
  while (CheckWhite() || (aIncludeNewLines == INCLUDE_NEW_LINE && CheckEOL())) {
  }

  // The last Check* call above failed by design; clear the flag and restore
  // the rollback mark saved before the loop.
  mHasFailed = false;
  mRollback = rollback;
}
112 
// Skips tokens until |aToken| (or EOF) is parsed; the stop token itself is
// not consumed (Rollback() inside the loop puts it back).  mRollback is then
// set to the position where the skip started, so a later Rollback() returns
// all the way there.
void
Tokenizer::SkipUntil(Token const& aToken)
{
  nsACString::const_char_iterator rollback = mCursor;
  const Token eof = Token::EndOfFile();

  Token t;
  while (Next(t)) {
    if (aToken.Equals(t) || eof.Equals(t)) {
      // Leave the stop token for the caller to consume.
      Rollback();
      break;
    }
  }

  mRollback = rollback;
}
129 
130 bool
CheckChar(bool (* aClassifier)(const char aChar))131 Tokenizer::CheckChar(bool (*aClassifier)(const char aChar))
132 {
133   if (!aClassifier) {
134     MOZ_ASSERT(false);
135     return false;
136   }
137 
138   if (!HasInput() || mCursor == mEnd) {
139     mHasFailed = true;
140     return false;
141   }
142 
143   if (!aClassifier(*mCursor)) {
144     mHasFailed = true;
145     return false;
146   }
147 
148   mRollback = mCursor;
149   ++mCursor;
150   mHasFailed = false;
151   return true;
152 }
153 
154 bool
ReadChar(char * aValue)155 Tokenizer::ReadChar(char* aValue)
156 {
157   MOZ_RELEASE_ASSERT(aValue);
158 
159   Token t;
160   if (!Check(TOKEN_CHAR, t)) {
161     return false;
162   }
163 
164   *aValue = t.AsChar();
165   return true;
166 }
167 
168 bool
ReadChar(bool (* aClassifier)(const char aChar),char * aValue)169 Tokenizer::ReadChar(bool (*aClassifier)(const char aChar), char* aValue)
170 {
171   MOZ_RELEASE_ASSERT(aValue);
172 
173   if (!CheckChar(aClassifier)) {
174     return false;
175   }
176 
177   *aValue = *mRollback;
178   return true;
179 }
180 
181 bool
ReadWord(nsACString & aValue)182 Tokenizer::ReadWord(nsACString& aValue)
183 {
184   Token t;
185   if (!Check(TOKEN_WORD, t)) {
186     return false;
187   }
188 
189   aValue.Assign(t.AsString());
190   return true;
191 }
192 
193 bool
ReadWord(nsDependentCSubstring & aValue)194 Tokenizer::ReadWord(nsDependentCSubstring& aValue)
195 {
196   Token t;
197   if (!Check(TOKEN_WORD, t)) {
198     return false;
199   }
200 
201   aValue.Rebind(t.AsString().BeginReading(), t.AsString().Length());
202   return true;
203 }
204 
205 bool
ReadUntil(Token const & aToken,nsACString & aResult,ClaimInclusion aInclude)206 Tokenizer::ReadUntil(Token const& aToken, nsACString& aResult, ClaimInclusion aInclude)
207 {
208   nsDependentCSubstring substring;
209   bool rv = ReadUntil(aToken, substring, aInclude);
210   aResult.Assign(substring);
211   return rv;
212 }
213 
// Reads everything up to (and, per |aInclude|, possibly including) |aToken|
// into |aResult| as a dependent substring of the input buffer.  Returns
// false when the token was not found before EOF; whatever was consumed is
// still claimed into |aResult|.  Rollback() afterwards returns to where this
// call started.
bool
Tokenizer::ReadUntil(Token const& aToken, nsDependentCSubstring& aResult, ClaimInclusion aInclude)
{
  // Mark the claim start at the current position.
  Record();
  nsACString::const_char_iterator rollback = mCursor;

  bool found = false;
  Token t;
  while (Next(t)) {
    if (aToken.Equals(t)) {
      found = true;
      break;
    }
  }

  Claim(aResult, aInclude);
  mRollback = rollback;
  return found;
}
233 
// Reverts the cursor to the start of the last successfully parsed token.
// May be used only once per parse; also recovers from the past-EOF state.
void
Tokenizer::Rollback()
{
  MOZ_ASSERT(mCursor > mRollback || mPastEof,
             "Tokenizer::Rollback() cannot use twice or before any parsing");

  mPastEof = false;
  mHasFailed = false;
  mCursor = mRollback;
}
244 
245 void
Record(ClaimInclusion aInclude)246 Tokenizer::Record(ClaimInclusion aInclude)
247 {
248   mRecord = aInclude == INCLUDE_LAST
249     ? mRollback
250     : mCursor;
251 }
252 
253 void
Claim(nsACString & aResult,ClaimInclusion aInclusion)254 Tokenizer::Claim(nsACString& aResult, ClaimInclusion aInclusion)
255 {
256   nsACString::const_char_iterator close = aInclusion == EXCLUDE_LAST
257     ? mRollback
258     : mCursor;
259   aResult.Assign(Substring(mRecord, close));
260 }
261 
262 void
Claim(nsDependentCSubstring & aResult,ClaimInclusion aInclusion)263 Tokenizer::Claim(nsDependentCSubstring& aResult, ClaimInclusion aInclusion)
264 {
265   nsACString::const_char_iterator close = aInclusion == EXCLUDE_LAST
266     ? mRollback
267     : mCursor;
268   aResult.Rebind(mRecord, close - mRecord);
269 }
270 
271 // TokenizerBase
272 
// Initializes the base tokenizer state.  A null |aWhitespaces| falls back to
// the default " \t" set; |aAdditionalWordChars| may stay null (no extra
// word characters).  Cursor/end start null until input is provided.
TokenizerBase::TokenizerBase(const char* aWhitespaces,
                             const char* aAdditionalWordChars)
  : mPastEof(false)
  , mHasFailed(false)
  , mInputFinished(true)
  , mMode(Mode::FULL)
  , mMinRawDelivery(1024)
  , mWhitespaces(aWhitespaces ? aWhitespaces : sWhitespaces)
  , mAdditionalWordChars(aAdditionalWordChars)
  , mCursor(nullptr)
  , mEnd(nullptr)
  , mNextCustomTokenID(TOKEN_CUSTOM0)
{
}
287 
// Registers a custom token matching the string |aValue| and returns a copy
// of it.  The token receives a fresh TokenType above TOKEN_CUSTOM0 and can
// later be toggled with EnableCustomToken() or dropped with
// RemoveCustomToken().
TokenizerBase::Token
TokenizerBase::AddCustomToken(const nsACString & aValue,
                              ECaseSensitivity aCaseInsensitivity, bool aEnabled)
{
  MOZ_ASSERT(!aValue.IsEmpty());

  // Append an empty slot first, then fill it; the array owns the instance.
  UniquePtr<Token>& t = *mCustomTokens.AppendElement();
  t = MakeUnique<Token>();

  t->mType = static_cast<TokenType>(++mNextCustomTokenID);
  t->mCustomCaseInsensitivity = aCaseInsensitivity;
  t->mCustomEnabled = aEnabled;
  t->mCustom.Assign(aValue);
  return *t;
}
303 
// Unregisters a previously added custom token.  |aToken|'s type is reset to
// TOKEN_UNKNOWN so a repeated call becomes a no-op.
void
TokenizerBase::RemoveCustomToken(Token& aToken)
{
  if (aToken.mType == TOKEN_UNKNOWN) {
    // Already removed
    return;
  }

  for (UniquePtr<Token> const& custom : mCustomTokens) {
    if (custom->mType == aToken.mType) {
      // This destroys the owned Token instance; safe only because we return
      // immediately without touching the range-for iterator again.
      mCustomTokens.RemoveElement(custom);
      aToken.mType = TOKEN_UNKNOWN;
      return;
    }
  }

  MOZ_ASSERT(false, "Token to remove not found");
}
322 
// Turns recognition of a registered custom token on or off during parsing.
void
TokenizerBase::EnableCustomToken(Token const& aToken, bool aEnabled)
{
  if (aToken.mType == TOKEN_UNKNOWN) {
    // Already removed
    return;
  }

  for (UniquePtr<Token> const& custom : mCustomTokens) {
    if (custom->Type() == aToken.Type()) {
      // Only the enabled flag changes; the token itself stays registered.
      custom->mCustomEnabled = aEnabled;
      return;
    }
  }

  MOZ_ASSERT(false, "Token to change not found");
}
341 
// Switches between full tokenization (Mode::FULL) and raw-plus-custom-token
// delivery (Mode::CUSTOM_ONLY); see Parse() for the behavioral difference.
void
TokenizerBase::SetTokenizingMode(Mode aMode)
{
  mMode = aMode;
}
347 
// True when the most recent parsing operation failed.
bool
TokenizerBase::HasFailed() const
{
  return mHasFailed;
}
353 
// True until the EOF token has been consumed.
bool
TokenizerBase::HasInput() const
{
  return !mPastEof;
}
359 
// Parses a single token starting at mCursor without moving the cursor.
// Fills |aToken| and returns the iterator just past the token.  Returning
// mCursor with |aToken| untouched means "not enough data yet", which can
// only happen in incremental mode (i.e. while !mInputFinished).
nsACString::const_char_iterator
TokenizerBase::Parse(Token& aToken) const
{
  if (mCursor == mEnd) {
    if (!mInputFinished) {
      // More data may still arrive; cannot decide yet.
      return mCursor;
    }

    aToken = Token::EndOfFile();
    return mEnd;
  }

  nsACString::size_type available = mEnd - mCursor;

  // Custom tokens take precedence over the generic classification below.
  // |longestCustom| accumulates the longest enabled custom token length and
  // is used to decide whether enough data is buffered to rule them out.
  uint32_t longestCustom = 0;
  for (UniquePtr<Token> const& custom : mCustomTokens) {
    if (IsCustom(mCursor, *custom, &longestCustom)) {
      aToken = *custom;
      return mCursor + custom->mCustom.Length();
    }
  }

  if (!mInputFinished && available < longestCustom) {
    // Not enough data to deterministically decide.
    return mCursor;
  }

  nsACString::const_char_iterator next = mCursor;

  if (mMode == Mode::CUSTOM_ONLY) {
    // We have to do a brute-force search for all of the enabled custom
    // tokens.
    while (next < mEnd) {
      ++next;
      for (UniquePtr<Token> const& custom : mCustomTokens) {
        if (IsCustom(next, *custom)) {
          // Deliver everything before the custom token as raw data; the
          // custom token itself will be parsed by the next call.
          aToken = Token::Raw();
          return next;
        }
      }
    }

    if (mInputFinished) {
      // End of the data reached.
      aToken = Token::Raw();
      return next;
    }

    if (longestCustom < available && available > mMinRawDelivery) {
      // We can return some data w/o waiting for either a custom token
      // or call to FinishData() when we leave the tail where all the
      // custom tokens potentially fit, so we can't lose only partially
      // delivered tokens.  This preserves reasonable granularity.
      aToken = Token::Raw();
      return mEnd - longestCustom + 1;
    }

    // Not enough data to deterministically decide.
    return mCursor;
  }

  enum State {
    PARSE_INTEGER,
    PARSE_WORD,
    PARSE_CRLF,
    PARSE_LF,
    PARSE_WS,
    PARSE_CHAR,
  } state;

  // Classify the token by its first character.
  if (IsWordFirst(*next)) {
    state = PARSE_WORD;
  } else if (IsNumber(*next)) {
    state = PARSE_INTEGER;
  } else if (strchr(mWhitespaces, *next)) { // not UTF-8 friendly?
    state = PARSE_WS;
  } else if (*next == '\r') {
    state = PARSE_CRLF;
  } else if (*next == '\n') {
    state = PARSE_LF;
  } else {
    state = PARSE_CHAR;
  }

  // Checked arithmetic catches uint64_t overflow on very long numbers.
  mozilla::CheckedUint64 resultingNumber = 0;

  while (next < mEnd) {
    switch (state) {
    case PARSE_INTEGER:
      // Keep it simple for now
      resultingNumber *= 10;
      resultingNumber += static_cast<uint64_t>(*next - '0');

      ++next;
      if (IsPending(next)) {
        // Number may continue in data not yet received.
        break;
      }
      if (IsEnd(next) || !IsNumber(*next)) {
        if (!resultingNumber.isValid()) {
          // Overflowed; deliver an error token instead of a wrong value.
          aToken = Token::Error();
        } else {
          aToken = Token::Number(resultingNumber.value());
        }
        return next;
      }
      break;

    case PARSE_WORD:
      ++next;
      if (IsPending(next)) {
        break;
      }
      if (IsEnd(next) || !IsWord(*next)) {
        aToken = Token::Word(Substring(mCursor, next));
        return next;
      }
      break;

    case PARSE_CRLF:
      ++next;
      if (IsPending(next)) {
        break;
      }
      if (!IsEnd(next) && *next == '\n') { // LF is optional
        ++next;
      }
      aToken = Token::NewLine();
      return next;

    case PARSE_LF:
      ++next;
      aToken = Token::NewLine();
      return next;

    case PARSE_WS:
      ++next;
      aToken = Token::Whitespace();
      return next;

    case PARSE_CHAR:
      ++next;
      aToken = Token::Char(*mCursor);
      return next;
    } // switch (state)
  } // while (next < end)

  // Ran out of buffered data in the middle of a token; wait for more input.
  MOZ_ASSERT(!mInputFinished);
  return mCursor;
}
509 
// True when |caret| has reached the end of the buffered input.
bool
TokenizerBase::IsEnd(const nsACString::const_char_iterator& caret) const
{
  return caret == mEnd;
}
515 
// True when |caret| is at the buffer end but more input may still arrive
// (incremental feed not yet finished) — i.e. we cannot decide the token yet.
bool
TokenizerBase::IsPending(const nsACString::const_char_iterator& caret) const
{
  return IsEnd(caret) && !mInputFinished;
}
521 
// True when |aInput| may start a word token: any cased letter (its lower and
// upper case forms differ), underscore, or one of the caller-supplied
// additional word characters.
bool
TokenizerBase::IsWordFirst(const char aInput) const
{
  // TODO: make this fully work with unicode
  return (ToLowerCase(static_cast<uint32_t>(aInput)) !=
          ToUpperCase(static_cast<uint32_t>(aInput))) ||
          '_' == aInput ||
          (mAdditionalWordChars ? !!strchr(mAdditionalWordChars, aInput) : false);
}
531 
// True when |aInput| may appear inside a word: word-start chars plus digits.
bool
TokenizerBase::IsWord(const char aInput) const
{
  return IsWordFirst(aInput) || IsNumber(aInput);
}
537 
// True for ASCII decimal digits only.
bool
TokenizerBase::IsNumber(const char aInput) const
{
  // TODO: are there unicode numbers?
  return aInput >= '0' && aInput <= '9';
}
544 
// Checks whether the input at |caret| begins with |aCustomToken|'s string.
// Disabled tokens never match.  When |aLongest| is provided it is raised to
// this token's length for every *enabled* token inspected (even when it
// doesn't match); Parse() uses that for its look-ahead decisions.
bool
TokenizerBase::IsCustom(const nsACString::const_char_iterator & caret,
                        const Token & aCustomToken,
                        uint32_t * aLongest) const
{
  MOZ_ASSERT(aCustomToken.mType > TOKEN_CUSTOM0);
  if (!aCustomToken.mCustomEnabled) {
    return false;
  }

  if (aLongest) {
    *aLongest = std::max(*aLongest, aCustomToken.mCustom.Length());
  }

  // Not enough input left to hold the whole token.
  uint32_t inputLength = mEnd - caret;
  if (aCustomToken.mCustom.Length() > inputLength) {
    return false;
  }

  // Compare a same-length window of the input against the token string.
  nsDependentCSubstring inputFragment(caret, aCustomToken.mCustom.Length());
  if (aCustomToken.mCustomCaseInsensitivity == CASE_INSENSITIVE) {
    return inputFragment.Equals(aCustomToken.mCustom, nsCaseInsensitiveUTF8StringComparator());
  }
  return inputFragment.Equals(aCustomToken.mCustom);
}
570 
// Forwards to Token::AssignFragment(), attaching the [begin, end) source
// range the token was parsed from.
void TokenizerBase::AssignFragment(Token& aToken,
                                   nsACString::const_char_iterator begin,
                                   nsACString::const_char_iterator end)
{
  aToken.AssignFragment(begin, end);
}
577 
578 // TokenizerBase::Token
579 
// Constructs an empty token of type TOKEN_UNKNOWN with zeroed payloads.
TokenizerBase::Token::Token()
  : mType(TOKEN_UNKNOWN)
  , mChar(0)
  , mInteger(0)
  , mCustomCaseInsensitivity(CASE_SENSITIVE)
  , mCustomEnabled(false)
{
}
588 
// Copy constructor.  mWord is a dependent (non-owning) substring, so it is
// rebound explicitly, and only for the token types that actually carry a
// word payload (words and custom tokens).
TokenizerBase::Token::Token(const Token& aOther)
  : mType(aOther.mType)
  , mCustom(aOther.mCustom)
  , mChar(aOther.mChar)
  , mInteger(aOther.mInteger)
  , mCustomCaseInsensitivity(aOther.mCustomCaseInsensitivity)
  , mCustomEnabled(aOther.mCustomEnabled)
{
  if (mType == TOKEN_WORD || mType > TOKEN_CUSTOM0) {
    mWord.Rebind(aOther.mWord.BeginReading(), aOther.mWord.Length());
  }
}
601 
// Copy assignment.  Unlike the copy constructor, the dependent word
// substring is rebound unconditionally.
TokenizerBase::Token&
TokenizerBase::Token::operator=(const Token& aOther)
{
  mType = aOther.mType;
  mCustom = aOther.mCustom;
  mChar = aOther.mChar;
  mWord.Rebind(aOther.mWord.BeginReading(), aOther.mWord.Length());
  mInteger = aOther.mInteger;
  mCustomCaseInsensitivity = aOther.mCustomCaseInsensitivity;
  mCustomEnabled = aOther.mCustomEnabled;
  return *this;
}
614 
// Remembers the [begin, end) input range this token was parsed from as a
// dependent (non-owning) substring.
void
TokenizerBase::Token::AssignFragment(nsACString::const_char_iterator begin,
                                     nsACString::const_char_iterator end)
{
  mFragment.Rebind(begin, end - begin);
}
621 
622 // static
623 TokenizerBase::Token
Raw()624 TokenizerBase::Token::Raw()
625 {
626   Token t;
627   t.mType = TOKEN_RAW;
628   return t;
629 }
630 
631 // static
632 TokenizerBase::Token
Word(const nsACString & aValue)633 TokenizerBase::Token::Word(const nsACString& aValue)
634 {
635   Token t;
636   t.mType = TOKEN_WORD;
637   t.mWord.Rebind(aValue.BeginReading(), aValue.Length());
638   return t;
639 }
640 
641 // static
642 TokenizerBase::Token
Char(const char aValue)643 TokenizerBase::Token::Char(const char aValue)
644 {
645   Token t;
646   t.mType = TOKEN_CHAR;
647   t.mChar = aValue;
648   return t;
649 }
650 
651 // static
652 TokenizerBase::Token
Number(const uint64_t aValue)653 TokenizerBase::Token::Number(const uint64_t aValue)
654 {
655   Token t;
656   t.mType = TOKEN_INTEGER;
657   t.mInteger = aValue;
658   return t;
659 }
660 
661 // static
662 TokenizerBase::Token
Whitespace()663 TokenizerBase::Token::Whitespace()
664 {
665   Token t;
666   t.mType = TOKEN_WS;
667   t.mChar = '\0';
668   return t;
669 }
670 
671 // static
672 TokenizerBase::Token
NewLine()673 TokenizerBase::Token::NewLine()
674 {
675   Token t;
676   t.mType = TOKEN_EOL;
677   return t;
678 }
679 
680 // static
681 TokenizerBase::Token
EndOfFile()682 TokenizerBase::Token::EndOfFile()
683 {
684   Token t;
685   t.mType = TOKEN_EOF;
686   return t;
687 }
688 
689 // static
690 TokenizerBase::Token
Error()691 TokenizerBase::Token::Error()
692 {
693   Token t;
694   t.mType = TOKEN_ERROR;
695   return t;
696 }
697 
698 bool
Equals(const Token & aOther) const699 TokenizerBase::Token::Equals(const Token& aOther) const
700 {
701   if (mType != aOther.mType) {
702     return false;
703   }
704 
705   switch (mType) {
706   case TOKEN_INTEGER:
707     return AsInteger() == aOther.AsInteger();
708   case TOKEN_WORD:
709     return AsString() == aOther.AsString();
710   case TOKEN_CHAR:
711     return AsChar() == aOther.AsChar();
712   default:
713     return true;
714   }
715 }
716 
// Returns the character payload; valid only for char and whitespace tokens.
char
TokenizerBase::Token::AsChar() const
{
  MOZ_ASSERT(mType == TOKEN_CHAR || mType == TOKEN_WS);
  return mChar;
}
723 
// Returns the word payload as a dependent substring of the input; valid
// only for word tokens.
nsDependentCSubstring
TokenizerBase::Token::AsString() const
{
  MOZ_ASSERT(mType == TOKEN_WORD);
  return mWord;
}
730 
// Returns the numeric payload; valid only for integer tokens.
uint64_t
TokenizerBase::Token::AsInteger() const
{
  MOZ_ASSERT(mType == TOKEN_INTEGER);
  return mInteger;
}
737 
738 } // mozilla
739