1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7 #include "Tokenizer.h"
8
9 #include "nsUnicharUtils.h"
10 #include <algorithm>
11
12 namespace mozilla {
13
// Default whitespace set (space and tab) used when the caller passes a null
// aWhitespaces to the constructors below.
static const char sWhitespaces[] = " \t";
15
Tokenizer(const nsACString & aSource,const char * aWhitespaces,const char * aAdditionalWordChars)16 Tokenizer::Tokenizer(const nsACString& aSource,
17 const char* aWhitespaces,
18 const char* aAdditionalWordChars)
19 : TokenizerBase(aWhitespaces, aAdditionalWordChars)
20 {
21 mInputFinished = true;
22 aSource.BeginReading(mCursor);
23 mRecord = mRollback = mCursor;
24 aSource.EndReading(mEnd);
25 }
26
// Convenience overload: wraps a null-terminated C string in a dependent
// nsCString and delegates to the nsACString constructor above.
Tokenizer::Tokenizer(const char* aSource,
                     const char* aWhitespaces,
                     const char* aAdditionalWordChars)
  : Tokenizer(nsDependentCString(aSource), aWhitespaces, aAdditionalWordChars)
{
}
33
34 bool
Next(Token & aToken)35 Tokenizer::Next(Token& aToken)
36 {
37 if (!HasInput()) {
38 mHasFailed = true;
39 return false;
40 }
41
42 mRollback = mCursor;
43 mCursor = Parse(aToken);
44
45 AssignFragment(aToken, mRollback, mCursor);
46
47 mPastEof = aToken.Type() == TOKEN_EOF;
48 mHasFailed = false;
49 return true;
50 }
51
52 bool
Check(const TokenType aTokenType,Token & aResult)53 Tokenizer::Check(const TokenType aTokenType, Token& aResult)
54 {
55 if (!HasInput()) {
56 mHasFailed = true;
57 return false;
58 }
59
60 nsACString::const_char_iterator next = Parse(aResult);
61 if (aTokenType != aResult.Type()) {
62 mHasFailed = true;
63 return false;
64 }
65
66 mRollback = mCursor;
67 mCursor = next;
68
69 AssignFragment(aResult, mRollback, mCursor);
70
71 mPastEof = aResult.Type() == TOKEN_EOF;
72 mHasFailed = false;
73 return true;
74 }
75
76 bool
Check(const Token & aToken)77 Tokenizer::Check(const Token& aToken)
78 {
79 if (!HasInput()) {
80 mHasFailed = true;
81 return false;
82 }
83
84 Token parsed;
85 nsACString::const_char_iterator next = Parse(parsed);
86 if (!aToken.Equals(parsed)) {
87 mHasFailed = true;
88 return false;
89 }
90
91 mRollback = mCursor;
92 mCursor = next;
93 mPastEof = parsed.Type() == TOKEN_EOF;
94 mHasFailed = false;
95 return true;
96 }
97
98 void
SkipWhites(WhiteSkipping aIncludeNewLines)99 Tokenizer::SkipWhites(WhiteSkipping aIncludeNewLines)
100 {
101 if (!CheckWhite() && (aIncludeNewLines == DONT_INCLUDE_NEW_LINE || !CheckEOL())) {
102 return;
103 }
104
105 nsACString::const_char_iterator rollback = mRollback;
106 while (CheckWhite() || (aIncludeNewLines == INCLUDE_NEW_LINE && CheckEOL())) {
107 }
108
109 mHasFailed = false;
110 mRollback = rollback;
111 }
112
113 void
SkipUntil(Token const & aToken)114 Tokenizer::SkipUntil(Token const& aToken)
115 {
116 nsACString::const_char_iterator rollback = mCursor;
117 const Token eof = Token::EndOfFile();
118
119 Token t;
120 while (Next(t)) {
121 if (aToken.Equals(t) || eof.Equals(t)) {
122 Rollback();
123 break;
124 }
125 }
126
127 mRollback = rollback;
128 }
129
130 bool
CheckChar(bool (* aClassifier)(const char aChar))131 Tokenizer::CheckChar(bool (*aClassifier)(const char aChar))
132 {
133 if (!aClassifier) {
134 MOZ_ASSERT(false);
135 return false;
136 }
137
138 if (!HasInput() || mCursor == mEnd) {
139 mHasFailed = true;
140 return false;
141 }
142
143 if (!aClassifier(*mCursor)) {
144 mHasFailed = true;
145 return false;
146 }
147
148 mRollback = mCursor;
149 ++mCursor;
150 mHasFailed = false;
151 return true;
152 }
153
154 bool
ReadChar(char * aValue)155 Tokenizer::ReadChar(char* aValue)
156 {
157 MOZ_RELEASE_ASSERT(aValue);
158
159 Token t;
160 if (!Check(TOKEN_CHAR, t)) {
161 return false;
162 }
163
164 *aValue = t.AsChar();
165 return true;
166 }
167
168 bool
ReadChar(bool (* aClassifier)(const char aChar),char * aValue)169 Tokenizer::ReadChar(bool (*aClassifier)(const char aChar), char* aValue)
170 {
171 MOZ_RELEASE_ASSERT(aValue);
172
173 if (!CheckChar(aClassifier)) {
174 return false;
175 }
176
177 *aValue = *mRollback;
178 return true;
179 }
180
181 bool
ReadWord(nsACString & aValue)182 Tokenizer::ReadWord(nsACString& aValue)
183 {
184 Token t;
185 if (!Check(TOKEN_WORD, t)) {
186 return false;
187 }
188
189 aValue.Assign(t.AsString());
190 return true;
191 }
192
193 bool
ReadWord(nsDependentCSubstring & aValue)194 Tokenizer::ReadWord(nsDependentCSubstring& aValue)
195 {
196 Token t;
197 if (!Check(TOKEN_WORD, t)) {
198 return false;
199 }
200
201 aValue.Rebind(t.AsString().BeginReading(), t.AsString().Length());
202 return true;
203 }
204
205 bool
ReadUntil(Token const & aToken,nsACString & aResult,ClaimInclusion aInclude)206 Tokenizer::ReadUntil(Token const& aToken, nsACString& aResult, ClaimInclusion aInclude)
207 {
208 nsDependentCSubstring substring;
209 bool rv = ReadUntil(aToken, substring, aInclude);
210 aResult.Assign(substring);
211 return rv;
212 }
213
214 bool
ReadUntil(Token const & aToken,nsDependentCSubstring & aResult,ClaimInclusion aInclude)215 Tokenizer::ReadUntil(Token const& aToken, nsDependentCSubstring& aResult, ClaimInclusion aInclude)
216 {
217 Record();
218 nsACString::const_char_iterator rollback = mCursor;
219
220 bool found = false;
221 Token t;
222 while (Next(t)) {
223 if (aToken.Equals(t)) {
224 found = true;
225 break;
226 }
227 }
228
229 Claim(aResult, aInclude);
230 mRollback = rollback;
231 return found;
232 }
233
234 void
Rollback()235 Tokenizer::Rollback()
236 {
237 MOZ_ASSERT(mCursor > mRollback || mPastEof,
238 "Tokenizer::Rollback() cannot use twice or before any parsing");
239
240 mPastEof = false;
241 mHasFailed = false;
242 mCursor = mRollback;
243 }
244
245 void
Record(ClaimInclusion aInclude)246 Tokenizer::Record(ClaimInclusion aInclude)
247 {
248 mRecord = aInclude == INCLUDE_LAST
249 ? mRollback
250 : mCursor;
251 }
252
253 void
Claim(nsACString & aResult,ClaimInclusion aInclusion)254 Tokenizer::Claim(nsACString& aResult, ClaimInclusion aInclusion)
255 {
256 nsACString::const_char_iterator close = aInclusion == EXCLUDE_LAST
257 ? mRollback
258 : mCursor;
259 aResult.Assign(Substring(mRecord, close));
260 }
261
262 void
Claim(nsDependentCSubstring & aResult,ClaimInclusion aInclusion)263 Tokenizer::Claim(nsDependentCSubstring& aResult, ClaimInclusion aInclusion)
264 {
265 nsACString::const_char_iterator close = aInclusion == EXCLUDE_LAST
266 ? mRollback
267 : mCursor;
268 aResult.Rebind(mRecord, close - mRecord);
269 }
270
271 // TokenizerBase
272
// Base-class constructor: an empty tokenizer with no input attached yet.
// aWhitespaces may be null (falls back to the default " \t" set);
// aAdditionalWordChars may be null (no extra word characters).
TokenizerBase::TokenizerBase(const char* aWhitespaces,
                             const char* aAdditionalWordChars)
  : mPastEof(false)
  , mHasFailed(false)
  , mInputFinished(true)
  , mMode(Mode::FULL)
  , mMinRawDelivery(1024)
  , mWhitespaces(aWhitespaces ? aWhitespaces : sWhitespaces)
  , mAdditionalWordChars(aAdditionalWordChars)
  , mCursor(nullptr)
  , mEnd(nullptr)
  , mNextCustomTokenID(TOKEN_CUSTOM0)
{
}
287
288 TokenizerBase::Token
AddCustomToken(const nsACString & aValue,ECaseSensitivity aCaseInsensitivity,bool aEnabled)289 TokenizerBase::AddCustomToken(const nsACString & aValue,
290 ECaseSensitivity aCaseInsensitivity, bool aEnabled)
291 {
292 MOZ_ASSERT(!aValue.IsEmpty());
293
294 UniquePtr<Token>& t = *mCustomTokens.AppendElement();
295 t = MakeUnique<Token>();
296
297 t->mType = static_cast<TokenType>(++mNextCustomTokenID);
298 t->mCustomCaseInsensitivity = aCaseInsensitivity;
299 t->mCustomEnabled = aEnabled;
300 t->mCustom.Assign(aValue);
301 return *t;
302 }
303
304 void
RemoveCustomToken(Token & aToken)305 TokenizerBase::RemoveCustomToken(Token& aToken)
306 {
307 if (aToken.mType == TOKEN_UNKNOWN) {
308 // Already removed
309 return;
310 }
311
312 for (UniquePtr<Token> const& custom : mCustomTokens) {
313 if (custom->mType == aToken.mType) {
314 mCustomTokens.RemoveElement(custom);
315 aToken.mType = TOKEN_UNKNOWN;
316 return;
317 }
318 }
319
320 MOZ_ASSERT(false, "Token to remove not found");
321 }
322
323 void
EnableCustomToken(Token const & aToken,bool aEnabled)324 TokenizerBase::EnableCustomToken(Token const& aToken, bool aEnabled)
325 {
326 if (aToken.mType == TOKEN_UNKNOWN) {
327 // Already removed
328 return;
329 }
330
331 for (UniquePtr<Token> const& custom : mCustomTokens) {
332 if (custom->Type() == aToken.Type()) {
333 // This effectively destroys the token instance.
334 custom->mCustomEnabled = aEnabled;
335 return;
336 }
337 }
338
339 MOZ_ASSERT(false, "Token to change not found");
340 }
341
// Switches between full tokenization (Mode::FULL) and scanning that only
// recognizes custom tokens and raw spans (Mode::CUSTOM_ONLY).
void
TokenizerBase::SetTokenizingMode(Mode aMode)
{
  mMode = aMode;
}
347
// True when the most recent Check*/Read*/Next operation failed.
bool
TokenizerBase::HasFailed() const
{
  return mHasFailed;
}
353
// True until the EOF token has been consumed.
bool
TokenizerBase::HasInput() const
{
  return !mPastEof;
}
359
// Parses a single token starting at mCursor and returns the iterator
// positioned just past it.  The method is const: it does not commit any
// state; callers adopt the returned position themselves.  When input is
// incremental (mInputFinished == false) and there is not enough data to
// decide, mCursor itself is returned to signal "pending".
nsACString::const_char_iterator
TokenizerBase::Parse(Token& aToken) const
{
  if (mCursor == mEnd) {
    if (!mInputFinished) {
      // More data may still arrive; cannot produce EOF yet.
      return mCursor;
    }

    aToken = Token::EndOfFile();
    return mEnd;
  }

  nsACString::size_type available = mEnd - mCursor;

  // First preference: a custom token anchored right at the cursor.
  // longestCustom accumulates the length of the longest *enabled* custom
  // token (updated by IsCustom even on non-matches) for the incremental
  // decisions below.
  uint32_t longestCustom = 0;
  for (UniquePtr<Token> const& custom : mCustomTokens) {
    if (IsCustom(mCursor, *custom, &longestCustom)) {
      aToken = *custom;
      return mCursor + custom->mCustom.Length();
    }
  }

  if (!mInputFinished && available < longestCustom) {
    // Not enough data to deterministically decide.
    return mCursor;
  }

  nsACString::const_char_iterator next = mCursor;

  if (mMode == Mode::CUSTOM_ONLY) {
    // We have to do a brute-force search for all of the enabled custom
    // tokens; everything before the nearest custom match is one raw token.
    while (next < mEnd) {
      ++next;
      for (UniquePtr<Token> const& custom : mCustomTokens) {
        if (IsCustom(next, *custom)) {
          aToken = Token::Raw();
          return next;
        }
      }
    }

    if (mInputFinished) {
      // End of the data reached.
      aToken = Token::Raw();
      return next;
    }

    if (longestCustom < available && available > mMinRawDelivery) {
      // We can return some data w/o waiting for either a custom token
      // or call to FinishData() when we leave the tail where all the
      // custom tokens potentially fit, so we can't lose only partially
      // delivered tokens. This preserves reasonable granularity.
      aToken = Token::Raw();
      return mEnd - longestCustom + 1;
    }

    // Not enough data to deterministically decide.
    return mCursor;
  }

  // Full tokenization: classify the token by its first character, then
  // extend as far as the class allows.
  enum State {
    PARSE_INTEGER,
    PARSE_WORD,
    PARSE_CRLF,
    PARSE_LF,
    PARSE_WS,
    PARSE_CHAR,
  } state;

  // NOTE: word-first wins over digit, so identifiers cannot start with a
  // digit but digits can appear inside words (see IsWord).
  if (IsWordFirst(*next)) {
    state = PARSE_WORD;
  } else if (IsNumber(*next)) {
    state = PARSE_INTEGER;
  } else if (strchr(mWhitespaces, *next)) { // not UTF-8 friendly?
    state = PARSE_WS;
  } else if (*next == '\r') {
    state = PARSE_CRLF;
  } else if (*next == '\n') {
    state = PARSE_LF;
  } else {
    state = PARSE_CHAR;
  }

  // Checked arithmetic: overflowing number literals yield Token::Error().
  mozilla::CheckedUint64 resultingNumber = 0;

  while (next < mEnd) {
    switch (state) {
    case PARSE_INTEGER:
      // Keep it simple for now
      resultingNumber *= 10;
      resultingNumber += static_cast<uint64_t>(*next - '0');

      ++next;
      if (IsPending(next)) {
        // Ran out of buffered data mid-number; wait for more input.
        break;
      }
      if (IsEnd(next) || !IsNumber(*next)) {
        if (!resultingNumber.isValid()) {
          aToken = Token::Error();
        } else {
          aToken = Token::Number(resultingNumber.value());
        }
        return next;
      }
      break;

    case PARSE_WORD:
      ++next;
      if (IsPending(next)) {
        // Ran out of buffered data mid-word; wait for more input.
        break;
      }
      if (IsEnd(next) || !IsWord(*next)) {
        aToken = Token::Word(Substring(mCursor, next));
        return next;
      }
      break;

    case PARSE_CRLF:
      ++next;
      if (IsPending(next)) {
        // The optional LF may still arrive; wait for more input.
        break;
      }
      if (!IsEnd(next) && *next == '\n') { // LF is optional
        ++next;
      }
      aToken = Token::NewLine();
      return next;

    case PARSE_LF:
      ++next;
      aToken = Token::NewLine();
      return next;

    case PARSE_WS:
      ++next;
      aToken = Token::Whitespace();
      return next;

    case PARSE_CHAR:
      ++next;
      aToken = Token::Char(*mCursor);
      return next;
    } // switch (state)
  } // while (next < end)

  // Fell off the end mid-token: only legal when more input may arrive.
  MOZ_ASSERT(!mInputFinished);
  return mCursor;
}
509
// True when caret sits at the end of the currently buffered input.
bool
TokenizerBase::IsEnd(const nsACString::const_char_iterator& caret) const
{
  return caret == mEnd;
}
515
// True when caret reached the buffered end but more input may still arrive
// (incremental mode) — i.e. parsing must be suspended, not terminated.
bool
TokenizerBase::IsPending(const nsACString::const_char_iterator& caret) const
{
  return IsEnd(caret) && !mInputFinished;
}
521
522 bool
IsWordFirst(const char aInput) const523 TokenizerBase::IsWordFirst(const char aInput) const
524 {
525 // TODO: make this fully work with unicode
526 return (ToLowerCase(static_cast<uint32_t>(aInput)) !=
527 ToUpperCase(static_cast<uint32_t>(aInput))) ||
528 '_' == aInput ||
529 (mAdditionalWordChars ? !!strchr(mAdditionalWordChars, aInput) : false);
530 }
531
// True when aInput may continue a word token: a word-first char or a digit.
bool
TokenizerBase::IsWord(const char aInput) const
{
  return IsWordFirst(aInput) || IsNumber(aInput);
}
537
// True for an ASCII decimal digit.
bool
TokenizerBase::IsNumber(const char aInput) const
{
  // TODO: are there unicode numbers?
  return aInput >= '0' && aInput <= '9';
}
544
545 bool
IsCustom(const nsACString::const_char_iterator & caret,const Token & aCustomToken,uint32_t * aLongest) const546 TokenizerBase::IsCustom(const nsACString::const_char_iterator & caret,
547 const Token & aCustomToken,
548 uint32_t * aLongest) const
549 {
550 MOZ_ASSERT(aCustomToken.mType > TOKEN_CUSTOM0);
551 if (!aCustomToken.mCustomEnabled) {
552 return false;
553 }
554
555 if (aLongest) {
556 *aLongest = std::max(*aLongest, aCustomToken.mCustom.Length());
557 }
558
559 uint32_t inputLength = mEnd - caret;
560 if (aCustomToken.mCustom.Length() > inputLength) {
561 return false;
562 }
563
564 nsDependentCSubstring inputFragment(caret, aCustomToken.mCustom.Length());
565 if (aCustomToken.mCustomCaseInsensitivity == CASE_INSENSITIVE) {
566 return inputFragment.Equals(aCustomToken.mCustom, nsCaseInsensitiveUTF8StringComparator());
567 }
568 return inputFragment.Equals(aCustomToken.mCustom);
569 }
570
// Forwards to Token::AssignFragment: records the input span [begin, end)
// the token was parsed from.
void TokenizerBase::AssignFragment(Token& aToken,
                                   nsACString::const_char_iterator begin,
                                   nsACString::const_char_iterator end)
{
  aToken.AssignFragment(begin, end);
}
577
578 // TokenizerBase::Token
579
// Default token: TOKEN_UNKNOWN with all scalar payloads zeroed.
TokenizerBase::Token::Token()
  : mType(TOKEN_UNKNOWN)
  , mChar(0)
  , mInteger(0)
  , mCustomCaseInsensitivity(CASE_SENSITIVE)
  , mCustomEnabled(false)
{
}
588
// Copy constructor.  mWord is a dependent (non-owning) substring and is
// rebound only for types that actually carry a word fragment (words and
// custom tokens); for other types the source's mWord is left untouched.
TokenizerBase::Token::Token(const Token& aOther)
  : mType(aOther.mType)
  , mCustom(aOther.mCustom)
  , mChar(aOther.mChar)
  , mInteger(aOther.mInteger)
  , mCustomCaseInsensitivity(aOther.mCustomCaseInsensitivity)
  , mCustomEnabled(aOther.mCustomEnabled)
{
  if (mType == TOKEN_WORD || mType > TOKEN_CUSTOM0) {
    mWord.Rebind(aOther.mWord.BeginReading(), aOther.mWord.Length());
  }
}
601
// Assignment operator.
// NOTE(review): unlike the copy constructor, mWord is rebound
// unconditionally here regardless of mType — presumably aOther.mWord is
// always in a readable (possibly empty) state; confirm before changing.
TokenizerBase::Token&
TokenizerBase::Token::operator=(const Token& aOther)
{
  mType = aOther.mType;
  mCustom = aOther.mCustom;
  mChar = aOther.mChar;
  mWord.Rebind(aOther.mWord.BeginReading(), aOther.mWord.Length());
  mInteger = aOther.mInteger;
  mCustomCaseInsensitivity = aOther.mCustomCaseInsensitivity;
  mCustomEnabled = aOther.mCustomEnabled;
  return *this;
}
614
// Binds mFragment as a non-owning view of the input span [begin, end)
// this token was parsed from.
void
TokenizerBase::Token::AssignFragment(nsACString::const_char_iterator begin,
                                     nsACString::const_char_iterator end)
{
  mFragment.Rebind(begin, end - begin);
}
621
622 // static
623 TokenizerBase::Token
Raw()624 TokenizerBase::Token::Raw()
625 {
626 Token t;
627 t.mType = TOKEN_RAW;
628 return t;
629 }
630
631 // static
632 TokenizerBase::Token
Word(const nsACString & aValue)633 TokenizerBase::Token::Word(const nsACString& aValue)
634 {
635 Token t;
636 t.mType = TOKEN_WORD;
637 t.mWord.Rebind(aValue.BeginReading(), aValue.Length());
638 return t;
639 }
640
641 // static
642 TokenizerBase::Token
Char(const char aValue)643 TokenizerBase::Token::Char(const char aValue)
644 {
645 Token t;
646 t.mType = TOKEN_CHAR;
647 t.mChar = aValue;
648 return t;
649 }
650
651 // static
652 TokenizerBase::Token
Number(const uint64_t aValue)653 TokenizerBase::Token::Number(const uint64_t aValue)
654 {
655 Token t;
656 t.mType = TOKEN_INTEGER;
657 t.mInteger = aValue;
658 return t;
659 }
660
661 // static
662 TokenizerBase::Token
Whitespace()663 TokenizerBase::Token::Whitespace()
664 {
665 Token t;
666 t.mType = TOKEN_WS;
667 t.mChar = '\0';
668 return t;
669 }
670
671 // static
672 TokenizerBase::Token
NewLine()673 TokenizerBase::Token::NewLine()
674 {
675 Token t;
676 t.mType = TOKEN_EOL;
677 return t;
678 }
679
680 // static
681 TokenizerBase::Token
EndOfFile()682 TokenizerBase::Token::EndOfFile()
683 {
684 Token t;
685 t.mType = TOKEN_EOF;
686 return t;
687 }
688
689 // static
690 TokenizerBase::Token
Error()691 TokenizerBase::Token::Error()
692 {
693 Token t;
694 t.mType = TOKEN_ERROR;
695 return t;
696 }
697
698 bool
Equals(const Token & aOther) const699 TokenizerBase::Token::Equals(const Token& aOther) const
700 {
701 if (mType != aOther.mType) {
702 return false;
703 }
704
705 switch (mType) {
706 case TOKEN_INTEGER:
707 return AsInteger() == aOther.AsInteger();
708 case TOKEN_WORD:
709 return AsString() == aOther.AsString();
710 case TOKEN_CHAR:
711 return AsChar() == aOther.AsChar();
712 default:
713 return true;
714 }
715 }
716
// Returns the character payload; valid for TOKEN_CHAR and TOKEN_WS only.
char
TokenizerBase::Token::AsChar() const
{
  MOZ_ASSERT(mType == TOKEN_CHAR || mType == TOKEN_WS);
  return mChar;
}
723
// Returns the word payload as a non-owning view; valid for TOKEN_WORD only.
nsDependentCSubstring
TokenizerBase::Token::AsString() const
{
  MOZ_ASSERT(mType == TOKEN_WORD);
  return mWord;
}
730
// Returns the numeric payload; valid for TOKEN_INTEGER only.
uint64_t
TokenizerBase::Token::AsInteger() const
{
  MOZ_ASSERT(mType == TOKEN_INTEGER);
  return mInteger;
}
737
738 } // mozilla
739