// Package css is a CSS3 lexer and parser following the specifications at http://www.w3.org/TR/css-syntax-3/.
package css

// TODO: \uFFFD replacement character for NULL bytes in strings for example, or at least don't end the string early

import (
	"bytes"
	"io"
	"strconv"

	"github.com/tdewolff/parse/v2"
)

// TokenType determines the type of token, eg. a number or a semicolon.
type TokenType uint32

// TokenType values.
const (
	ErrorToken TokenType = iota // extra token when errors occur
	IdentToken
	FunctionToken  // rgb( rgba( ...
	AtKeywordToken // @abc
	HashToken      // #abc
	StringToken
	BadStringToken
	URLToken
	BadURLToken
	DelimToken            // any unmatched character
	NumberToken           // 5
	PercentageToken       // 5%
	DimensionToken        // 5em
	UnicodeRangeToken     // U+554A
	IncludeMatchToken     // ~=
	DashMatchToken        // |=
	PrefixMatchToken      // ^=
	SuffixMatchToken      // $=
	SubstringMatchToken   // *=
	ColumnToken           // ||
	WhitespaceToken       // space \t \r \n \f
	CDOToken              // <!--
	CDCToken              // -->
	ColonToken            // :
	SemicolonToken        // ;
	CommaToken            // ,
	LeftBracketToken      // [
	RightBracketToken     // ]
	LeftParenthesisToken  // (
	RightParenthesisToken // )
	LeftBraceToken        // {
	RightBraceToken       // }
	CommentToken          // extra token for comments
	EmptyToken
	CustomPropertyNameToken
	CustomPropertyValueToken
)

// String returns the string representation of a TokenType.
58func (tt TokenType) String() string { 59 switch tt { 60 case ErrorToken: 61 return "Error" 62 case IdentToken: 63 return "Ident" 64 case FunctionToken: 65 return "Function" 66 case AtKeywordToken: 67 return "AtKeyword" 68 case HashToken: 69 return "Hash" 70 case StringToken: 71 return "String" 72 case BadStringToken: 73 return "BadString" 74 case URLToken: 75 return "URL" 76 case BadURLToken: 77 return "BadURL" 78 case DelimToken: 79 return "Delim" 80 case NumberToken: 81 return "Number" 82 case PercentageToken: 83 return "Percentage" 84 case DimensionToken: 85 return "Dimension" 86 case UnicodeRangeToken: 87 return "UnicodeRange" 88 case IncludeMatchToken: 89 return "IncludeMatch" 90 case DashMatchToken: 91 return "DashMatch" 92 case PrefixMatchToken: 93 return "PrefixMatch" 94 case SuffixMatchToken: 95 return "SuffixMatch" 96 case SubstringMatchToken: 97 return "SubstringMatch" 98 case ColumnToken: 99 return "Column" 100 case WhitespaceToken: 101 return "Whitespace" 102 case CDOToken: 103 return "CDO" 104 case CDCToken: 105 return "CDC" 106 case ColonToken: 107 return "Colon" 108 case SemicolonToken: 109 return "Semicolon" 110 case CommaToken: 111 return "Comma" 112 case LeftBracketToken: 113 return "LeftBracket" 114 case RightBracketToken: 115 return "RightBracket" 116 case LeftParenthesisToken: 117 return "LeftParenthesis" 118 case RightParenthesisToken: 119 return "RightParenthesis" 120 case LeftBraceToken: 121 return "LeftBrace" 122 case RightBraceToken: 123 return "RightBrace" 124 case CommentToken: 125 return "Comment" 126 case EmptyToken: 127 return "Empty" 128 case CustomPropertyNameToken: 129 return "CustomPropertyName" 130 case CustomPropertyValueToken: 131 return "CustomPropertyValue" 132 } 133 return "Invalid(" + strconv.Itoa(int(tt)) + ")" 134} 135 136//////////////////////////////////////////////////////////////// 137 138// Lexer is the state for the lexer. 
type Lexer struct {
	r *parse.Input
}

// NewLexer returns a new Lexer for a given io.Reader.
func NewLexer(r *parse.Input) *Lexer {
	return &Lexer{
		r: r,
	}
}

// Err returns the error encountered during lexing, this is often io.EOF but also other errors can be returned.
func (l *Lexer) Err() error {
	return l.r.Err()
}

// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message.
// It dispatches on the first byte: each case tries one or more token
// productions and, when none match, falls through to a one-byte DelimToken.
func (l *Lexer) Next() (TokenType, []byte) {
	switch l.r.Peek(0) {
	case ' ', '\t', '\n', '\r', '\f':
		// coalesce a run of whitespace into a single token
		l.r.Move(1)
		for l.consumeWhitespace() {
		}
		return WhitespaceToken, l.r.Shift()
	case ':':
		l.r.Move(1)
		return ColonToken, l.r.Shift()
	case ';':
		l.r.Move(1)
		return SemicolonToken, l.r.Shift()
	case ',':
		l.r.Move(1)
		return CommaToken, l.r.Shift()
	case '(', ')', '[', ']', '{', '}':
		if t := l.consumeBracket(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '#':
		if l.consumeHashToken() {
			return HashToken, l.r.Shift()
		}
	case '"', '\'':
		if t := l.consumeString(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '.', '+':
		if t := l.consumeNumeric(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '-':
		// '-' can start a number (-5), an ident (-moz-…), the CDC token (-->)
		// or a custom property name (--x); try them in that order.
		if t := l.consumeNumeric(); t != ErrorToken {
			return t, l.r.Shift()
		} else if t := l.consumeIdentlike(); t != ErrorToken {
			return t, l.r.Shift()
		} else if l.consumeCDCToken() {
			return CDCToken, l.r.Shift()
		} else if l.consumeCustomVariableToken() {
			return CustomPropertyNameToken, l.r.Shift()
		}
	case '@':
		if l.consumeAtKeywordToken() {
			return AtKeywordToken, l.r.Shift()
		}
	case '$', '*', '^', '~':
		if t := l.consumeMatch(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '/':
		if l.consumeComment() {
			return CommentToken, l.r.Shift()
		}
	case '<':
		if l.consumeCDOToken() {
			return CDOToken, l.r.Shift()
		}
	case '\\':
		// a backslash can only start an escaped identifier
		if t := l.consumeIdentlike(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case 'u', 'U':
		// u/U may start a unicode-range (u+XXXX) or a regular identifier
		if l.consumeUnicodeRangeToken() {
			return UnicodeRangeToken, l.r.Shift()
		} else if t := l.consumeIdentlike(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '|':
		if t := l.consumeMatch(); t != ErrorToken {
			return t, l.r.Shift()
		} else if l.consumeColumnToken() {
			return ColumnToken, l.r.Shift()
		}
	case 0:
		// a zero byte together with a non-nil Err() marks the end of input
		if l.r.Err() != nil {
			return ErrorToken, nil
		}
	default:
		if t := l.consumeNumeric(); t != ErrorToken {
			return t, l.r.Shift()
		} else if t := l.consumeIdentlike(); t != ErrorToken {
			return t, l.r.Shift()
		}
	}
	// can't be rune because consumeIdentlike consumes that as an identifier
	l.r.Move(1)
	return DelimToken, l.r.Shift()
}

////////////////////////////////////////////////////////////////

/*
The following functions follow the railroad diagrams in http://www.w3.org/TR/css3-syntax/
*/

// consumeByte consumes the next byte if it equals c and reports whether it did.
func (l *Lexer) consumeByte(c byte) bool {
	if l.r.Peek(0) == c {
		l.r.Move(1)
		return true
	}
	return false
}

// consumeComment consumes a /* … */ comment; a comment that is never closed
// runs to the end of input and is still reported as consumed.
func (l *Lexer) consumeComment() bool {
	if l.r.Peek(0) != '/' || l.r.Peek(1) != '*' {
		return false
	}
	l.r.Move(2)
	for {
		c := l.r.Peek(0)
		if c == 0 && l.r.Err() != nil {
			// EOF inside the comment
			break
		} else if c == '*' && l.r.Peek(1) == '/' {
			l.r.Move(2)
			return true
		}
		l.r.Move(1)
	}
	return true
}

// consumeNewline consumes a single newline: \n, \f, \r or the \r\n pair.
func (l *Lexer) consumeNewline() bool {
	c := l.r.Peek(0)
	if c == '\n' || c == '\f' {
		l.r.Move(1)
		return true
	} else if c == '\r' {
		if l.r.Peek(1) == '\n' {
			l.r.Move(2)
		} else {
			l.r.Move(1)
		}
		return true
	}
	return false
}

// consumeWhitespace consumes one whitespace byte and reports whether it did.
func (l *Lexer) consumeWhitespace() bool {
	c := l.r.Peek(0)
	if c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
		l.r.Move(1)
		return true
	}
	return false
}

// consumeDigit consumes one decimal digit and reports whether it did.
func (l *Lexer) consumeDigit() bool {
	c := l.r.Peek(0)
	if c >= '0' && c <= '9' {
		l.r.Move(1)
		return true
	}
	return false
}

// consumeHexDigit consumes one hexadecimal digit and reports whether it did.
func (l *Lexer) consumeHexDigit() bool {
	c := l.r.Peek(0)
	if (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') {
		l.r.Move(1)
		return true
	}
	return false
}

// consumeEscape consumes a backslash escape: '\' followed by 1-6 hex digits
// (plus one optional trailing whitespace byte) or by any single non-newline
// character. A backslash before a newline or at EOF is not a valid escape and
// leaves the position unchanged.
func (l *Lexer) consumeEscape() bool {
	if l.r.Peek(0) != '\\' {
		return false
	}
	mark := l.r.Pos()
	l.r.Move(1)
	if l.consumeNewline() {
		// backslash-newline is not an escape
		l.r.Rewind(mark)
		return false
	} else if l.consumeHexDigit() {
		// up to five more hex digits, then one optional whitespace separator
		for k := 1; k < 6; k++ {
			if !l.consumeHexDigit() {
				break
			}
		}
		l.consumeWhitespace()
		return true
	} else {
		c := l.r.Peek(0)
		if c >= 0xC0 {
			// leading byte of a multi-byte UTF-8 rune: consume the whole rune
			_, n := l.r.PeekRune(0)
			l.r.Move(n)
			return true
		} else if c == 0 && l.r.Err() != nil {
			// backslash at EOF
			l.r.Rewind(mark)
			return false
		}
	}
	l.r.Move(1)
	return true
}

// consumeIdentToken consumes an identifier: an optional '-', a name-start
// character (letter, '_', non-ASCII byte or escape), then any number of name
// characters. It reports whether an identifier was consumed.
func (l *Lexer) consumeIdentToken() bool {
	mark := l.r.Pos()
	if l.r.Peek(0) == '-' {
		l.r.Move(1)
	}
	c := l.r.Peek(0)
	if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c >= 0x80) {
		if c != '\\' || !l.consumeEscape() {
			l.r.Rewind(mark)
			return false
		}
	} else {
		l.r.Move(1)
	}
	for {
		c := l.r.Peek(0)
		if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) {
			if c != '\\' || !l.consumeEscape() {
				break
			}
		} else {
			l.r.Move(1)
		}
	}
	return true
}

// support custom variables, https://www.w3.org/TR/css-variables-1/
// consumeCustomVariableToken consumes a '--name' custom property name; the
// caller must already have consumed nothing past the first '-'.
func (l *Lexer) consumeCustomVariableToken() bool {
	// expect to be on a '-'
	l.r.Move(1)
	if l.r.Peek(0) != '-' {
		l.r.Move(-1)
		return false
	}
	if !l.consumeIdentToken() {
		l.r.Move(-1)
		return false
	}
	return true
}

// consumeAtKeywordToken consumes '@' followed by an identifier.
func (l *Lexer) consumeAtKeywordToken() bool {
	// expect to be on an '@'
	l.r.Move(1)
	if !l.consumeIdentToken() {
		l.r.Move(-1)
		return false
	}
	return true
}

// consumeHashToken consumes '#' followed by at least one name character or
// escape; a lone '#' is rejected and the position restored.
func (l *Lexer) consumeHashToken() bool {
	// expect to be on a '#'
	mark := l.r.Pos()
	l.r.Move(1)
	c := l.r.Peek(0)
	if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) {
		if c != '\\' || !l.consumeEscape() {
			l.r.Rewind(mark)
			return false
		}
	} else {
		l.r.Move(1)
	}
	for {
		c := l.r.Peek(0)
		if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) {
			if c != '\\' || !l.consumeEscape() {
				break
			}
		} else {
			l.r.Move(1)
		}
	}
	return true
}

// consumeNumberToken consumes a number: optional sign, integer and/or
// fractional digits, and an optional exponent. A trailing '.' or 'e' that is
// not followed by digits is left for the next token.
func (l *Lexer) consumeNumberToken() bool {
	mark := l.r.Pos()
	c := l.r.Peek(0)
	if c == '+' || c == '-' {
		l.r.Move(1)
	}
	firstDigit := l.consumeDigit()
	if firstDigit {
		for l.consumeDigit() {
		}
	}
	if l.r.Peek(0) == '.' {
		l.r.Move(1)
		if l.consumeDigit() {
			for l.consumeDigit() {
			}
		} else if firstDigit {
			// . could belong to the next token
			l.r.Move(-1)
			return true
		} else {
			l.r.Rewind(mark)
			return false
		}
	} else if !firstDigit {
		l.r.Rewind(mark)
		return false
	}
	mark = l.r.Pos()
	c = l.r.Peek(0)
	if c == 'e' || c == 'E' {
		l.r.Move(1)
		c = l.r.Peek(0)
		if c == '+' || c == '-' {
			l.r.Move(1)
		}
		if !l.consumeDigit() {
			// e could belong to next token
			l.r.Rewind(mark)
			return true
		}
		for l.consumeDigit() {
		}
	}
	return true
}

// consumeUnicodeRangeToken consumes u+XXXX, u+XXXX-YYYY or u+XX?? forms with
// 1 to 6 hex digits (question marks may pad the first group).
func (l *Lexer) consumeUnicodeRangeToken() bool {
	c := l.r.Peek(0)
	if (c != 'u' && c != 'U') || l.r.Peek(1) != '+' {
		return false
	}
	mark := l.r.Pos()
	l.r.Move(2)

	// consume up to 6 hexDigits
	k := 0
	for l.consumeHexDigit() {
		k++
	}

	// either a minus or a question mark or the end is expected
	if l.consumeByte('-') {
		if k == 0 || 6 < k {
			l.r.Rewind(mark)
			return false
		}

		// consume another up to 6 hexDigits
		if l.consumeHexDigit() {
			k = 1
			for l.consumeHexDigit() {
				k++
			}
		} else {
			l.r.Rewind(mark)
			return false
		}
	} else if l.consumeByte('?') {
		// could be filled up to 6 characters with question marks or else regular hexDigits
		k++
		for l.consumeByte('?') {
			k++
		}
	}
	// k now counts the (last) group of digits/question marks; 1-6 are valid
	if k == 0 || 6 < k {
		l.r.Rewind(mark)
		return false
	}
	return true
}

// consumeColumnToken consumes the '||' column combinator.
func (l *Lexer) consumeColumnToken() bool {
	if l.r.Peek(0) == '|' && l.r.Peek(1) == '|' {
		l.r.Move(2)
		return true
	}
	return false
}

// consumeCDOToken consumes the '<!--' comment-open token.
func (l *Lexer) consumeCDOToken() bool {
	if l.r.Peek(0) == '<' && l.r.Peek(1) == '!' && l.r.Peek(2) == '-' && l.r.Peek(3) == '-' {
		l.r.Move(4)
		return true
	}
	return false
}

// consumeCDCToken consumes the '-->' comment-close token.
func (l *Lexer) consumeCDCToken() bool {
	if l.r.Peek(0) == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' {
		l.r.Move(3)
		return true
	}
	return false
}

////////////////////////////////////////////////////////////////

// consumeMatch consumes any MatchToken.
func (l *Lexer) consumeMatch() TokenType {
	if l.r.Peek(1) == '=' {
		switch l.r.Peek(0) {
		case '~':
			l.r.Move(2)
			return IncludeMatchToken
		case '|':
			l.r.Move(2)
			return DashMatchToken
		case '^':
			l.r.Move(2)
			return PrefixMatchToken
		case '$':
			l.r.Move(2)
			return SuffixMatchToken
		case '*':
			l.r.Move(2)
			return SubstringMatchToken
		}
	}
	return ErrorToken
}

// consumeBracket consumes any bracket token.
func (l *Lexer) consumeBracket() TokenType {
	switch l.r.Peek(0) {
	case '(':
		l.r.Move(1)
		return LeftParenthesisToken
	case ')':
		l.r.Move(1)
		return RightParenthesisToken
	case '[':
		l.r.Move(1)
		return LeftBracketToken
	case ']':
		l.r.Move(1)
		return RightBracketToken
	case '{':
		l.r.Move(1)
		return LeftBraceToken
	case '}':
		l.r.Move(1)
		return RightBraceToken
	}
	return ErrorToken
}

// consumeNumeric consumes NumberToken, PercentageToken or DimensionToken.
func (l *Lexer) consumeNumeric() TokenType {
	if l.consumeNumberToken() {
		if l.consumeByte('%') {
			return PercentageToken
		} else if l.consumeIdentToken() {
			// a unit identifier directly after the number makes it a dimension
			return DimensionToken
		}
		return NumberToken
	}
	return ErrorToken
}

// consumeString consumes a string and may return BadStringToken when a newline is encountered.
func (l *Lexer) consumeString() TokenType {
	// assume to be on " or '
	delim := l.r.Peek(0)
	l.r.Move(1)
	for {
		c := l.r.Peek(0)
		if c == 0 && l.r.Err() != nil {
			// EOF before the closing quote: the string ends here and is still a StringToken
			break
		} else if c == '\n' || c == '\r' || c == '\f' {
			// unescaped newline inside a string makes it a bad string
			l.r.Move(1)
			return BadStringToken
		} else if c == delim {
			l.r.Move(1)
			break
		} else if c == '\\' {
			if !l.consumeEscape() {
				// either newline or EOF after backslash
				l.r.Move(1)
				l.consumeNewline()
			}
		} else {
			l.r.Move(1)
		}
	}
	return StringToken
}

// consumeUnquotedURL consumes the body of an unquoted url(…) up to (but not
// including) the closing parenthesis or EOF. It reports false when it hits a
// byte that is not allowed in an unquoted URL (quote, '(', space, control
// characters, DEL) and that is not part of a valid escape.
func (l *Lexer) consumeUnquotedURL() bool {
	for {
		c := l.r.Peek(0)
		if c == 0 && l.r.Err() != nil || c == ')' {
			break
		} else if c == '"' || c == '\'' || c == '(' || c == '\\' || c == ' ' || c <= 0x1F || c == 0x7F {
			if c != '\\' || !l.consumeEscape() {
				return false
			}
		} else {
			l.r.Move(1)
		}
	}
	return true
}

// consumeRemnantsBadUrl consumes bytes of a BadUrlToken so that normal tokenization may continue.
// It skips ahead to the closing ')' or EOF, stepping over escapes so an
// escaped ')' does not terminate the bad URL early.
func (l *Lexer) consumeRemnantsBadURL() {
	for {
		if l.consumeByte(')') || l.r.Err() != nil {
			break
		} else if !l.consumeEscape() {
			l.r.Move(1)
		}
	}
}

// consumeIdentlike consumes IdentToken, FunctionToken or UrlToken.
func (l *Lexer) consumeIdentlike() TokenType {
	if l.consumeIdentToken() {
		if l.r.Peek(0) != '(' {
			// a plain identifier, not followed by an argument list
			return IdentToken
		} else if !parse.EqualFold(bytes.Replace(l.r.Lexeme(), []byte{'\\'}, nil, -1), []byte{'u', 'r', 'l'}) {
			// any function other than url(; stripping backslashes matches
			// escaped spellings such as u\rl( — NOTE(review): hex escapes
			// (e.g. \75 rl() are not decoded here and would be lexed as a
			// FunctionToken instead of a URL; confirm this is intended.
			l.r.Move(1)
			return FunctionToken
		}
		l.r.Move(1)

		// consume url
		for l.consumeWhitespace() {
		}
		if c := l.r.Peek(0); c == '"' || c == '\'' {
			// quoted URL body; a newline inside the quotes poisons the token
			if l.consumeString() == BadStringToken {
				l.consumeRemnantsBadURL()
				return BadURLToken
			}
		} else if !l.consumeUnquotedURL() && !l.consumeWhitespace() { // if unquoted URL fails due to encountering whitespace, continue
			l.consumeRemnantsBadURL()
			return BadURLToken
		}
		for l.consumeWhitespace() {
		}
		// a URL must close with ')', except that EOF also ends it cleanly
		if !l.consumeByte(')') && l.r.Err() != io.EOF {
			l.consumeRemnantsBadURL()
			return BadURLToken
		}
		return URLToken
	}
	return ErrorToken
}