1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */ 3 /* This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 /* tokenization of CSS style sheets */ 8 9 #ifndef nsCSSScanner_h___ 10 #define nsCSSScanner_h___ 11 12 #include "nsString.h" 13 14 namespace mozilla { 15 namespace css { 16 class ErrorReporter; 17 } // namespace css 18 } // namespace mozilla 19 20 // Token types; in close but not perfect correspondence to the token 21 // categorization in section 4.1.1 of CSS2.1. (The deviations are all 22 // the fault of css3-selectors, which has requirements that can only be 23 // met by changing the generic tokenization.) The comment on each line 24 // illustrates the form of each identifier. 25 26 enum nsCSSTokenType { 27 // White space of any kind. No value fields are used. Note that 28 // comments do *not* count as white space; comments separate tokens 29 // but are not themselves tokens. 30 eCSSToken_Whitespace, // 31 // A comment. 32 eCSSToken_Comment, // /*...*/ 33 34 // Identifier-like tokens. mIdent is the text of the identifier. 35 // The difference between ID and Hash is: if the text after the # 36 // would have been a valid Ident if the # hadn't been there, the 37 // scanner produces an ID token. Otherwise it produces a Hash token. 38 // (This distinction is required by css3-selectors.) 39 eCSSToken_Ident, // word 40 eCSSToken_Function, // word( 41 eCSSToken_AtKeyword, // @word 42 eCSSToken_ID, // #word 43 eCSSToken_Hash, // #0word 44 45 // Numeric tokens. mNumber is the floating-point value of the 46 // number, and mHasSign indicates whether there was an explicit sign 47 // (+ or -) in front of the number. If mIntegerValid is true, the 48 // number had the lexical form of an integer, and mInteger is its 49 // integer value. Lexically integer values outside the range of a 50 // 32-bit signed number are clamped to the maximum values; mNumber 51 // will indicate a 'truer' value in that case. Percentage tokens 52 // are always considered not to be integers, even if their numeric 53 // value is integral (100% => mNumber = 1.0). For Dimension 54 // tokens, mIdent holds the text of the unit. 55 eCSSToken_Number, // 1 -5 +2e3 3.14159 7.297352e-3 56 eCSSToken_Dimension, // 24px 8.5in 57 eCSSToken_Percentage, // 85% 1280.4% 58 59 // String-like tokens. In all cases, mIdent holds the text 60 // belonging to the string, and mSymbol holds the delimiter 61 // character, which may be ', ", or zero (only for unquoted URLs). 62 // Bad_String and Bad_URL tokens are emitted when the closing 63 // delimiter or parenthesis was missing. 64 eCSSToken_String, // 'foo bar' "foo bar" 65 eCSSToken_Bad_String, // 'foo bar 66 eCSSToken_URL, // url(foobar) url("foo bar") 67 eCSSToken_Bad_URL, // url(foo 68 69 // Any one-character symbol. mSymbol holds the character. 70 eCSSToken_Symbol, // . ; { } ! * 71 72 // Match operators. These are single tokens rather than pairs of 73 // Symbol tokens because css3-selectors forbids the presence of 74 // comments between the two characters. No value fields are used; 75 // the token type indicates which operator. 76 eCSSToken_Includes, // ~= 77 eCSSToken_Dashmatch, // |= 78 eCSSToken_Beginsmatch, // ^= 79 eCSSToken_Endsmatch, // $= 80 eCSSToken_Containsmatch, // *= 81 82 // Unicode-range token: currently used only in @font-face. 83 // The lexical rule for this token includes several forms that are 84 // semantically invalid. Therefore, mIdent always holds the 85 // complete original text of the token (so we can print it 86 // accurately in diagnostics), and mIntegerValid is true iff the 87 // token is semantically valid. In that case, mInteger holds the 88 // lowest value included in the range, and mInteger2 holds the 89 // highest value included in the range. 90 eCSSToken_URange, // U+007e U+01?? U+2000-206F 91 92 // HTML comment delimiters, ignored as a unit when they appear at 93 // the top level of a style sheet, for compatibility with websites 94 // written for compatibility with pre-CSS browsers. This token type 95 // subsumes the css2.1 CDO and CDC tokens, which are always treated 96 // the same by the parser. mIdent holds the text of the token, for 97 // diagnostics. 98 eCSSToken_HTMLComment, // <!-- --> 99 }; 100 101 // Classification of tokens used to determine if a "/**/" string must be 102 // inserted if pasting token streams together when serializing. We include 103 // values corresponding to eCSSToken_Dashmatch and eCSSToken_Containsmatch, 104 // as css-syntax does not treat these as whole tokens, but we will still 105 // need to insert a "/**/" string between a '|' delim and a '|=' dashmatch 106 // and between a '/' delim and a '*=' containsmatch. 107 // 108 // https://drafts.csswg.org/css-syntax/#serialization 109 enum nsCSSTokenSerializationType { 110 eCSSTokenSerialization_Nothing, 111 eCSSTokenSerialization_Whitespace, 112 eCSSTokenSerialization_AtKeyword_or_Hash, 113 eCSSTokenSerialization_Number, 114 eCSSTokenSerialization_Dimension, 115 eCSSTokenSerialization_Percentage, 116 eCSSTokenSerialization_URange, 117 eCSSTokenSerialization_URL_or_BadURL, 118 eCSSTokenSerialization_Function, 119 eCSSTokenSerialization_Ident, 120 eCSSTokenSerialization_CDC, 121 eCSSTokenSerialization_DashMatch, 122 eCSSTokenSerialization_ContainsMatch, 123 eCSSTokenSerialization_Symbol_Hash, // '#' 124 eCSSTokenSerialization_Symbol_At, // '@' 125 eCSSTokenSerialization_Symbol_Dot_or_Plus, // '.', '+' 126 eCSSTokenSerialization_Symbol_Minus, // '-' 127 eCSSTokenSerialization_Symbol_OpenParen, // '(' 128 eCSSTokenSerialization_Symbol_Question, // '?' 129 eCSSTokenSerialization_Symbol_Assorted, // '$', '^', '~' 130 eCSSTokenSerialization_Symbol_Equals, // '=' 131 eCSSTokenSerialization_Symbol_Bar, // '|' 132 eCSSTokenSerialization_Symbol_Slash, // '/' 133 eCSSTokenSerialization_Symbol_Asterisk, // '*' 134 eCSSTokenSerialization_Other // anything else 135 }; 136 137 // A single token returned from the scanner. mType is always 138 // meaningful; comments above describe which other fields are 139 // meaningful for which token types. 140 struct nsCSSToken { 141 nsAutoString mIdent; 142 float mNumber; 143 int32_t mInteger; 144 int32_t mInteger2; 145 nsCSSTokenType mType; 146 char16_t mSymbol; 147 bool mIntegerValid; 148 bool mHasSign; 149 nsCSSTokennsCSSToken150 nsCSSToken() 151 : mNumber(0), 152 mInteger(0), 153 mInteger2(0), 154 mType(eCSSToken_Whitespace), 155 mSymbol('\0'), 156 mIntegerValid(false), 157 mHasSign(false) {} 158 IsSymbolnsCSSToken159 bool IsSymbol(char16_t aSymbol) const { 160 return mType == eCSSToken_Symbol && mSymbol == aSymbol; 161 } 162 163 void AppendToString(nsString& aBuffer) const; 164 }; 165 166 // Represents an nsCSSScanner's saved position in the input buffer. 167 class nsCSSScannerPosition { 168 friend class nsCSSScanner; 169 170 public: nsCSSScannerPosition()171 nsCSSScannerPosition() : mInitialized(false) {} 172 LineNumber()173 uint32_t LineNumber() { 174 MOZ_ASSERT(mInitialized); 175 return mLineNumber; 176 } 177 LineOffset()178 uint32_t LineOffset() { 179 MOZ_ASSERT(mInitialized); 180 return mLineOffset; 181 } 182 183 private: 184 uint32_t mOffset; 185 uint32_t mLineNumber; 186 uint32_t mLineOffset; 187 uint32_t mTokenLineNumber; 188 uint32_t mTokenLineOffset; 189 uint32_t mTokenOffset; 190 bool mInitialized; 191 }; 192 193 enum nsCSSScannerExclude { 194 // Return all tokens, including whitespace and comments. 195 eCSSScannerExclude_None, 196 // Include whitespace but exclude comments. 197 eCSSScannerExclude_Comments, 198 // Exclude whitespace and comments. 199 eCSSScannerExclude_WhitespaceAndComments 200 }; 201 202 // nsCSSScanner tokenizes an input stream using the CSS2.1 forward 203 // compatible tokenization rules. Used internally by nsCSSParser; 204 // not available for use by other code. 205 class nsCSSScanner { 206 public: 207 // |aLineNumber == 1| is the beginning of a file, use |aLineNumber == 0| 208 // when the line number is unknown. The scanner does not take 209 // ownership of |aBuffer|, so the caller must be sure to keep it 210 // alive for the lifetime of the scanner. 211 nsCSSScanner(const nsAString& aBuffer, uint32_t aLineNumber); 212 ~nsCSSScanner(); 213 SetErrorReporter(mozilla::css::ErrorReporter * aReporter)214 void SetErrorReporter(mozilla::css::ErrorReporter* aReporter) { 215 mReporter = aReporter; 216 } 217 218 // Reset or check whether a BAD_URL or BAD_STRING token has been seen. ClearSeenBadToken()219 void ClearSeenBadToken() { mSeenBadToken = false; } SeenBadToken()220 bool SeenBadToken() const { return mSeenBadToken; } 221 222 // Reset or check whether a "var(" FUNCTION token has been seen. ClearSeenVariableReference()223 void ClearSeenVariableReference() { mSeenVariableReference = false; } SeenVariableReference()224 bool SeenVariableReference() const { return mSeenVariableReference; } 225 226 // Get the 1-based line number of the last character of 227 // the most recently processed token. GetLineNumber()228 uint32_t GetLineNumber() const { return mTokenLineNumber; } 229 230 // Get the 0-based column number of the first character of 231 // the most recently processed token. GetColumnNumber()232 uint32_t GetColumnNumber() const { return mTokenOffset - mTokenLineOffset; } 233 GetTokenOffset()234 uint32_t GetTokenOffset() const { return mTokenOffset; } 235 GetTokenEndOffset()236 uint32_t GetTokenEndOffset() const { return mOffset; } 237 GetSourceMapURL()238 const nsAString& GetSourceMapURL() const { return mSourceMapURL; } 239 GetSourceURL()240 const nsAString& GetSourceURL() const { return mSourceURL; } 241 242 // Get the text of the line containing the first character of 243 // the most recently processed token. 244 nsDependentSubstring GetCurrentLine() const; 245 246 // Get the next token. Return false on EOF. aTokenResult is filled 247 // in with the data for the token. aSkip controls whether 248 // whitespace and/or comment tokens are ever returned. 249 bool Next(nsCSSToken& aTokenResult, nsCSSScannerExclude aSkip); 250 251 // Get the body of an URL token (everything after the 'url('). 252 // This is exposed for use by nsCSSParser::ParseMozDocumentRule, 253 // which, for historical reasons, must make additional function 254 // tokens behave like url(). Please do not add new uses to the 255 // parser. 256 void NextURL(nsCSSToken& aTokenResult); 257 258 // This is exposed for use by nsCSSParser::ParsePseudoClassWithNthPairArg, 259 // because "2n-1" is a single DIMENSION token, and "n-1" is a single 260 // IDENT token, but the :nth() selector syntax wants to interpret 261 // them the same as "2n -1" and "n -1" respectively. Please do not 262 // add new uses to the parser. 263 // 264 // Note: this function may not be used to back up over a line boundary. 265 void Backup(uint32_t n); 266 267 // Starts recording the input stream from the current position. 268 void StartRecording(); 269 270 // Abandons recording of the input stream. 271 void StopRecording(); 272 273 // Stops recording of the input stream and appends the recorded 274 // input to aBuffer. 275 void StopRecording(nsString& aBuffer); 276 277 // Returns the length of the current recording. 278 uint32_t RecordingLength() const; 279 280 #ifdef DEBUG 281 bool IsRecording() const; 282 #endif 283 284 // Stores the current scanner offset into the specified object. 285 void SavePosition(nsCSSScannerPosition& aState); 286 287 // Resets the scanner offset to a position saved by SavePosition. 288 void RestoreSavedPosition(const nsCSSScannerPosition& aState); 289 290 enum EOFCharacters { 291 eEOFCharacters_None = 0x0000, 292 293 // to handle \<EOF> inside strings 294 eEOFCharacters_DropBackslash = 0x0001, 295 296 // to handle \<EOF> outside strings 297 eEOFCharacters_ReplacementChar = 0x0002, 298 299 // to close comments 300 eEOFCharacters_Asterisk = 0x0004, 301 eEOFCharacters_Slash = 0x0008, 302 303 // to close double-quoted strings 304 eEOFCharacters_DoubleQuote = 0x0010, 305 306 // to close single-quoted strings 307 eEOFCharacters_SingleQuote = 0x0020, 308 309 // to close URLs 310 eEOFCharacters_CloseParen = 0x0040, 311 }; 312 313 // Appends any characters to the specified string the input stream to make the 314 // last token not rely on special EOF handling behavior. 315 // 316 // If eEOFCharacters_DropBackslash is in aEOFCharacters, it is ignored. 317 static void AppendImpliedEOFCharacters(EOFCharacters aEOFCharacters, 318 nsAString& aString); 319 GetEOFCharacters()320 EOFCharacters GetEOFCharacters() const { 321 #ifdef DEBUG 322 AssertEOFCharactersValid(mEOFCharacters); 323 #endif 324 return mEOFCharacters; 325 } 326 327 #ifdef DEBUG 328 static void AssertEOFCharactersValid(uint32_t c); 329 #endif 330 331 protected: 332 int32_t Peek(uint32_t n = 0); 333 void Advance(uint32_t n = 1); 334 void AdvanceLine(); 335 336 void SkipWhitespace(); 337 bool CheckCommentDirective(const nsAString& aDirective); 338 void SkipComment(); 339 340 bool GatherEscape(nsString& aOutput, bool aInString); 341 bool GatherText(uint8_t aClass, nsString& aIdent); 342 343 bool ScanIdent(nsCSSToken& aResult); 344 bool ScanAtKeyword(nsCSSToken& aResult); 345 bool ScanHash(nsCSSToken& aResult); 346 bool ScanNumber(nsCSSToken& aResult); 347 bool ScanString(nsCSSToken& aResult); 348 bool ScanURange(nsCSSToken& aResult); 349 350 void SetEOFCharacters(uint32_t aEOFCharacters); 351 void AddEOFCharacters(uint32_t aEOFCharacters); 352 353 const char16_t* mBuffer; 354 uint32_t mOffset; 355 uint32_t mCount; 356 357 uint32_t mLineNumber; 358 uint32_t mLineOffset; 359 360 uint32_t mTokenLineNumber; 361 uint32_t mTokenLineOffset; 362 uint32_t mTokenOffset; 363 364 uint32_t mRecordStartOffset; 365 EOFCharacters mEOFCharacters; 366 367 mozilla::css::ErrorReporter* mReporter; 368 369 bool mRecording; 370 bool mSeenBadToken; 371 bool mSeenVariableReference; 372 373 nsString mSourceMapURL; 374 nsString mSourceURL; 375 }; 376 377 // Token for the grid-template-areas micro-syntax 378 // http://dev.w3.org/csswg/css-grid/#propdef-grid-template-areas 379 struct MOZ_STACK_CLASS nsCSSGridTemplateAreaToken { 380 nsAutoString mName; // Empty for a null cell, non-empty for a named cell 381 bool isTrash; // True for a trash token, mName is ignored in this case. 382 }; 383 384 // Scanner for the grid-template-areas micro-syntax 385 class nsCSSGridTemplateAreaScanner { 386 public: 387 explicit nsCSSGridTemplateAreaScanner(const nsAString& aBuffer); 388 389 // Get the next token. Return false on EOF. 390 // aTokenResult is filled in with the data for the token. 391 bool Next(nsCSSGridTemplateAreaToken& aTokenResult); 392 393 private: 394 const char16_t* mBuffer; 395 uint32_t mOffset; 396 uint32_t mCount; 397 }; 398 399 #endif /* nsCSSScanner_h___ */ 400