1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 /* tokenization of CSS style sheets */
8 
9 #ifndef nsCSSScanner_h___
10 #define nsCSSScanner_h___
11 
12 #include "nsString.h"
13 
// Forward declaration: this header refers to ErrorReporter only through
// pointers, so the full class definition is not needed here.
namespace mozilla {
namespace css {

class ErrorReporter;

}  // namespace css
}  // namespace mozilla
19 
// Kinds of token produced by the CSS scanner.  The categories follow
// section 4.1.1 of CSS2.1 closely but not exactly; every deviation is
// forced by css3-selectors, whose requirements can only be met by
// changing the generic tokenization.  The comment block before each
// group says which nsCSSToken fields are meaningful, and the trailing
// comment on each enumerator shows examples of its lexical form.

enum nsCSSTokenType {
  // A run of white space of any kind.  No value fields are used.
  // Comments never count as white space; they separate tokens and are
  // classified separately, as eCSSToken_Comment.
  eCSSToken_Whitespace,  //
  // A comment.
  eCSSToken_Comment,  // /*...*/

  // Identifier-like tokens.  mIdent carries the identifier's text.
  // ID versus Hash: when the text following the '#' would, on its own,
  // have been a valid Ident, the scanner emits an ID token; otherwise
  // it emits a Hash token.  (css3-selectors requires this distinction.)
  eCSSToken_Ident,      // word
  eCSSToken_Function,   // word(
  eCSSToken_AtKeyword,  // @word
  eCSSToken_ID,         // #word
  eCSSToken_Hash,       // #0word

  // Numeric tokens.  mNumber is the floating-point value and mHasSign
  // records whether an explicit sign (+ or -) preceded the number.
  // When mIntegerValid is true the number was lexically an integer and
  // mInteger is its value.  Lexical integers that do not fit in a
  // 32-bit signed number are clamped to the maximum values; mNumber
  // then gives a 'truer' value.  Percentage tokens are never treated
  // as integers, even when their numeric value is integral
  // (100% => mNumber = 1.0).  For Dimension tokens, mIdent holds the
  // text of the unit.
  eCSSToken_Number,      // 1 -5 +2e3 3.14159 7.297352e-3
  eCSSToken_Dimension,   // 24px 8.5in
  eCSSToken_Percentage,  // 85% 1280.4%

  // String-like tokens.  mIdent always holds the text belonging to the
  // string and mSymbol holds the delimiter character: ', ", or zero
  // (zero only for unquoted URLs).  Bad_String and Bad_URL are emitted
  // when the closing delimiter or parenthesis was missing.
  eCSSToken_String,      // 'foo bar' "foo bar"
  eCSSToken_Bad_String,  // 'foo bar
  eCSSToken_URL,         // url(foobar) url("foo bar")
  eCSSToken_Bad_URL,     // url(foo

  // Any single-character symbol; the character is in mSymbol.
  eCSSToken_Symbol,  // . ; { } ! *

  // Match operators.  Each is one token instead of a pair of Symbol
  // tokens because css3-selectors forbids comments between the two
  // characters.  No value fields are used; the token type alone
  // identifies the operator.
  eCSSToken_Includes,       // ~=
  eCSSToken_Dashmatch,      // |=
  eCSSToken_Beginsmatch,    // ^=
  eCSSToken_Endsmatch,      // $=
  eCSSToken_Containsmatch,  // *=

  // Unicode-range token: currently used only in @font-face.
  // The lexical rule admits several forms that are semantically
  // invalid, so mIdent always holds the complete original text of the
  // token (for accurate diagnostics) and mIntegerValid is true iff the
  // token is semantically valid.  When it is, mInteger is the lowest
  // value included in the range and mInteger2 the highest.
  eCSSToken_URange,  // U+007e U+01?? U+2000-206F

  // HTML comment delimiters.  They are ignored as a unit when they
  // appear at the top level of a style sheet, for compatibility with
  // websites written for compatibility with pre-CSS browsers.  This
  // type subsumes the css2.1 CDO and CDC tokens, which the parser
  // always treats identically.  mIdent holds the token text, for
  // diagnostics.
  eCSSToken_HTMLComment,  // <!-- -->
};
100 
// Token classes consulted when serializing: they decide whether a "/**/"
// string has to be inserted where two token streams are pasted together.
// css-syntax does not treat dashmatch and containsmatch as whole tokens,
// but values corresponding to eCSSToken_Dashmatch and
// eCSSToken_Containsmatch are included anyway, because a "/**/" is still
// required between a '|' delim and a '|=' dashmatch, and between a '/'
// delim and a '*=' containsmatch.
//
// https://drafts.csswg.org/css-syntax/#serialization
enum nsCSSTokenSerializationType {
  // Token-level classes.
  eCSSTokenSerialization_Nothing,
  eCSSTokenSerialization_Whitespace,
  eCSSTokenSerialization_AtKeyword_or_Hash,
  eCSSTokenSerialization_Number,
  eCSSTokenSerialization_Dimension,
  eCSSTokenSerialization_Percentage,
  eCSSTokenSerialization_URange,
  eCSSTokenSerialization_URL_or_BadURL,
  eCSSTokenSerialization_Function,
  eCSSTokenSerialization_Ident,
  eCSSTokenSerialization_CDC,
  eCSSTokenSerialization_DashMatch,
  eCSSTokenSerialization_ContainsMatch,

  // Single-character symbol classes; the trailing comment lists the
  // character(s) belonging to each class.
  eCSSTokenSerialization_Symbol_Hash,         // '#'
  eCSSTokenSerialization_Symbol_At,           // '@'
  eCSSTokenSerialization_Symbol_Dot_or_Plus,  // '.', '+'
  eCSSTokenSerialization_Symbol_Minus,        // '-'
  eCSSTokenSerialization_Symbol_OpenParen,    // '('
  eCSSTokenSerialization_Symbol_Question,     // '?'
  eCSSTokenSerialization_Symbol_Assorted,     // '$', '^', '~'
  eCSSTokenSerialization_Symbol_Equals,       // '='
  eCSSTokenSerialization_Symbol_Bar,          // '|'
  eCSSTokenSerialization_Symbol_Slash,        // '/'
  eCSSTokenSerialization_Symbol_Asterisk,     // '*'

  // Anything not covered by the classes above.
  eCSSTokenSerialization_Other
};
136 
137 // A single token returned from the scanner.  mType is always
138 // meaningful; comments above describe which other fields are
139 // meaningful for which token types.
140 struct nsCSSToken {
141   nsAutoString mIdent;
142   float mNumber;
143   int32_t mInteger;
144   int32_t mInteger2;
145   nsCSSTokenType mType;
146   char16_t mSymbol;
147   bool mIntegerValid;
148   bool mHasSign;
149 
nsCSSTokennsCSSToken150   nsCSSToken()
151       : mNumber(0),
152         mInteger(0),
153         mInteger2(0),
154         mType(eCSSToken_Whitespace),
155         mSymbol('\0'),
156         mIntegerValid(false),
157         mHasSign(false) {}
158 
IsSymbolnsCSSToken159   bool IsSymbol(char16_t aSymbol) const {
160     return mType == eCSSToken_Symbol && mSymbol == aSymbol;
161   }
162 
163   void AppendToString(nsString& aBuffer) const;
164 };
165 
166 // Represents an nsCSSScanner's saved position in the input buffer.
167 class nsCSSScannerPosition {
168   friend class nsCSSScanner;
169 
170  public:
nsCSSScannerPosition()171   nsCSSScannerPosition() : mInitialized(false) {}
172 
LineNumber()173   uint32_t LineNumber() {
174     MOZ_ASSERT(mInitialized);
175     return mLineNumber;
176   }
177 
LineOffset()178   uint32_t LineOffset() {
179     MOZ_ASSERT(mInitialized);
180     return mLineOffset;
181   }
182 
183  private:
184   uint32_t mOffset;
185   uint32_t mLineNumber;
186   uint32_t mLineOffset;
187   uint32_t mTokenLineNumber;
188   uint32_t mTokenLineOffset;
189   uint32_t mTokenOffset;
190   bool mInitialized;
191 };
192 
// Selects which whitespace and comment tokens the scanner hands back;
// passed as the aSkip argument of nsCSSScanner::Next.
enum nsCSSScannerExclude {
  // Nothing is filtered out: whitespace and comment tokens are returned
  // along with all other tokens.
  eCSSScannerExclude_None,
  // Comment tokens are dropped; whitespace tokens are still returned.
  eCSSScannerExclude_Comments,
  // Both whitespace and comment tokens are dropped.
  eCSSScannerExclude_WhitespaceAndComments
};
201 
202 // nsCSSScanner tokenizes an input stream using the CSS2.1 forward
203 // compatible tokenization rules.  Used internally by nsCSSParser;
204 // not available for use by other code.
205 class nsCSSScanner {
206  public:
207   // |aLineNumber == 1| is the beginning of a file, use |aLineNumber == 0|
208   // when the line number is unknown.  The scanner does not take
209   // ownership of |aBuffer|, so the caller must be sure to keep it
210   // alive for the lifetime of the scanner.
211   nsCSSScanner(const nsAString& aBuffer, uint32_t aLineNumber);
212   ~nsCSSScanner();
213 
SetErrorReporter(mozilla::css::ErrorReporter * aReporter)214   void SetErrorReporter(mozilla::css::ErrorReporter* aReporter) {
215     mReporter = aReporter;
216   }
217 
218   // Reset or check whether a BAD_URL or BAD_STRING token has been seen.
ClearSeenBadToken()219   void ClearSeenBadToken() { mSeenBadToken = false; }
SeenBadToken()220   bool SeenBadToken() const { return mSeenBadToken; }
221 
222   // Reset or check whether a "var(" FUNCTION token has been seen.
ClearSeenVariableReference()223   void ClearSeenVariableReference() { mSeenVariableReference = false; }
SeenVariableReference()224   bool SeenVariableReference() const { return mSeenVariableReference; }
225 
226   // Get the 1-based line number of the last character of
227   // the most recently processed token.
GetLineNumber()228   uint32_t GetLineNumber() const { return mTokenLineNumber; }
229 
230   // Get the 0-based column number of the first character of
231   // the most recently processed token.
GetColumnNumber()232   uint32_t GetColumnNumber() const { return mTokenOffset - mTokenLineOffset; }
233 
GetTokenOffset()234   uint32_t GetTokenOffset() const { return mTokenOffset; }
235 
GetTokenEndOffset()236   uint32_t GetTokenEndOffset() const { return mOffset; }
237 
GetSourceMapURL()238   const nsAString& GetSourceMapURL() const { return mSourceMapURL; }
239 
GetSourceURL()240   const nsAString& GetSourceURL() const { return mSourceURL; }
241 
242   // Get the text of the line containing the first character of
243   // the most recently processed token.
244   nsDependentSubstring GetCurrentLine() const;
245 
246   // Get the next token.  Return false on EOF.  aTokenResult is filled
247   // in with the data for the token.  aSkip controls whether
248   // whitespace and/or comment tokens are ever returned.
249   bool Next(nsCSSToken& aTokenResult, nsCSSScannerExclude aSkip);
250 
251   // Get the body of an URL token (everything after the 'url(').
252   // This is exposed for use by nsCSSParser::ParseMozDocumentRule,
253   // which, for historical reasons, must make additional function
254   // tokens behave like url().  Please do not add new uses to the
255   // parser.
256   void NextURL(nsCSSToken& aTokenResult);
257 
258   // This is exposed for use by nsCSSParser::ParsePseudoClassWithNthPairArg,
259   // because "2n-1" is a single DIMENSION token, and "n-1" is a single
260   // IDENT token, but the :nth() selector syntax wants to interpret
261   // them the same as "2n -1" and "n -1" respectively.  Please do not
262   // add new uses to the parser.
263   //
264   // Note: this function may not be used to back up over a line boundary.
265   void Backup(uint32_t n);
266 
267   // Starts recording the input stream from the current position.
268   void StartRecording();
269 
270   // Abandons recording of the input stream.
271   void StopRecording();
272 
273   // Stops recording of the input stream and appends the recorded
274   // input to aBuffer.
275   void StopRecording(nsString& aBuffer);
276 
277   // Returns the length of the current recording.
278   uint32_t RecordingLength() const;
279 
280 #ifdef DEBUG
281   bool IsRecording() const;
282 #endif
283 
284   // Stores the current scanner offset into the specified object.
285   void SavePosition(nsCSSScannerPosition& aState);
286 
287   // Resets the scanner offset to a position saved by SavePosition.
288   void RestoreSavedPosition(const nsCSSScannerPosition& aState);
289 
290   enum EOFCharacters {
291     eEOFCharacters_None = 0x0000,
292 
293     // to handle \<EOF> inside strings
294     eEOFCharacters_DropBackslash = 0x0001,
295 
296     // to handle \<EOF> outside strings
297     eEOFCharacters_ReplacementChar = 0x0002,
298 
299     // to close comments
300     eEOFCharacters_Asterisk = 0x0004,
301     eEOFCharacters_Slash = 0x0008,
302 
303     // to close double-quoted strings
304     eEOFCharacters_DoubleQuote = 0x0010,
305 
306     // to close single-quoted strings
307     eEOFCharacters_SingleQuote = 0x0020,
308 
309     // to close URLs
310     eEOFCharacters_CloseParen = 0x0040,
311   };
312 
313   // Appends any characters to the specified string the input stream to make the
314   // last token not rely on special EOF handling behavior.
315   //
316   // If eEOFCharacters_DropBackslash is in aEOFCharacters, it is ignored.
317   static void AppendImpliedEOFCharacters(EOFCharacters aEOFCharacters,
318                                          nsAString& aString);
319 
GetEOFCharacters()320   EOFCharacters GetEOFCharacters() const {
321 #ifdef DEBUG
322     AssertEOFCharactersValid(mEOFCharacters);
323 #endif
324     return mEOFCharacters;
325   }
326 
327 #ifdef DEBUG
328   static void AssertEOFCharactersValid(uint32_t c);
329 #endif
330 
331  protected:
332   int32_t Peek(uint32_t n = 0);
333   void Advance(uint32_t n = 1);
334   void AdvanceLine();
335 
336   void SkipWhitespace();
337   bool CheckCommentDirective(const nsAString& aDirective);
338   void SkipComment();
339 
340   bool GatherEscape(nsString& aOutput, bool aInString);
341   bool GatherText(uint8_t aClass, nsString& aIdent);
342 
343   bool ScanIdent(nsCSSToken& aResult);
344   bool ScanAtKeyword(nsCSSToken& aResult);
345   bool ScanHash(nsCSSToken& aResult);
346   bool ScanNumber(nsCSSToken& aResult);
347   bool ScanString(nsCSSToken& aResult);
348   bool ScanURange(nsCSSToken& aResult);
349 
350   void SetEOFCharacters(uint32_t aEOFCharacters);
351   void AddEOFCharacters(uint32_t aEOFCharacters);
352 
353   const char16_t* mBuffer;
354   uint32_t mOffset;
355   uint32_t mCount;
356 
357   uint32_t mLineNumber;
358   uint32_t mLineOffset;
359 
360   uint32_t mTokenLineNumber;
361   uint32_t mTokenLineOffset;
362   uint32_t mTokenOffset;
363 
364   uint32_t mRecordStartOffset;
365   EOFCharacters mEOFCharacters;
366 
367   mozilla::css::ErrorReporter* mReporter;
368 
369   bool mRecording;
370   bool mSeenBadToken;
371   bool mSeenVariableReference;
372 
373   nsString mSourceMapURL;
374   nsString mSourceURL;
375 };
376 
377 // Token for the grid-template-areas micro-syntax
378 // http://dev.w3.org/csswg/css-grid/#propdef-grid-template-areas
379 struct MOZ_STACK_CLASS nsCSSGridTemplateAreaToken {
380   nsAutoString mName;  // Empty for a null cell, non-empty for a named cell
381   bool isTrash;        // True for a trash token, mName is ignored in this case.
382 };
383 
384 // Scanner for the grid-template-areas micro-syntax
385 class nsCSSGridTemplateAreaScanner {
386  public:
387   explicit nsCSSGridTemplateAreaScanner(const nsAString& aBuffer);
388 
389   // Get the next token.  Return false on EOF.
390   // aTokenResult is filled in with the data for the token.
391   bool Next(nsCSSGridTemplateAreaToken& aTokenResult);
392 
393  private:
394   const char16_t* mBuffer;
395   uint32_t mOffset;
396   uint32_t mCount;
397 };
398 
399 #endif /* nsCSSScanner_h___ */
400