// Package css is a CSS3 lexer and parser following the specifications at http://www.w3.org/TR/css-syntax-3/.
package css

// TODO: \uFFFD replacement character for NULL bytes in strings for example, or at least don't end the string early

import (
	"bytes"
	"io"
	"strconv"

	"github.com/tdewolff/parse/v2"
)

// TokenType determines the type of token, e.g. a number or a semicolon.
type TokenType uint32

// TokenType values.
const (
	ErrorToken TokenType = iota // extra token when errors occur
	IdentToken
	FunctionToken  // rgb( rgba( ...
	AtKeywordToken // @abc
	HashToken      // #abc
	StringToken
	BadStringToken
	URLToken
	BadURLToken
	DelimToken            // any unmatched character
	NumberToken           // 5
	PercentageToken       // 5%
	DimensionToken        // 5em
	UnicodeRangeToken     // U+554A
	IncludeMatchToken     // ~=
	DashMatchToken        // |=
	PrefixMatchToken      // ^=
	SuffixMatchToken      // $=
	SubstringMatchToken   // *=
	ColumnToken           // ||
	WhitespaceToken       // space \t \r \n \f
	CDOToken              // <!--
	CDCToken              // -->
	ColonToken            // :
	SemicolonToken        // ;
	CommaToken            // ,
	LeftBracketToken      // [
	RightBracketToken     // ]
	LeftParenthesisToken  // (
	RightParenthesisToken // )
	LeftBraceToken        // {
	RightBraceToken       // }
	CommentToken          // extra token for comments
	EmptyToken
	CustomPropertyNameToken
	CustomPropertyValueToken
)

// String returns the string representation of a TokenType.
func (tt TokenType) String() string {
	switch tt {
	case ErrorToken:
		return "Error"
	case IdentToken:
		return "Ident"
	case FunctionToken:
		return "Function"
	case AtKeywordToken:
		return "AtKeyword"
	case HashToken:
		return "Hash"
	case StringToken:
		return "String"
	case BadStringToken:
		return "BadString"
	case URLToken:
		return "URL"
	case BadURLToken:
		return "BadURL"
	case DelimToken:
		return "Delim"
	case NumberToken:
		return "Number"
	case PercentageToken:
		return "Percentage"
	case DimensionToken:
		return "Dimension"
	case UnicodeRangeToken:
		return "UnicodeRange"
	case IncludeMatchToken:
		return "IncludeMatch"
	case DashMatchToken:
		return "DashMatch"
	case PrefixMatchToken:
		return "PrefixMatch"
	case SuffixMatchToken:
		return "SuffixMatch"
	case SubstringMatchToken:
		return "SubstringMatch"
	case ColumnToken:
		return "Column"
	case WhitespaceToken:
		return "Whitespace"
	case CDOToken:
		return "CDO"
	case CDCToken:
		return "CDC"
	case ColonToken:
		return "Colon"
	case SemicolonToken:
		return "Semicolon"
	case CommaToken:
		return "Comma"
	case LeftBracketToken:
		return "LeftBracket"
	case RightBracketToken:
		return "RightBracket"
	case LeftParenthesisToken:
		return "LeftParenthesis"
	case RightParenthesisToken:
		return "RightParenthesis"
	case LeftBraceToken:
		return "LeftBrace"
	case RightBraceToken:
		return "RightBrace"
	case CommentToken:
		return "Comment"
	case EmptyToken:
		return "Empty"
	case CustomPropertyNameToken:
		return "CustomPropertyName"
	case CustomPropertyValueToken:
		return "CustomPropertyValue"
	}
	return "Invalid(" + strconv.Itoa(int(tt)) + ")"
}

////////////////////////////////////////////////////////////////

// Lexer is the state for the lexer.
type Lexer struct {
	r *parse.Input
}

// NewLexer returns a new Lexer for a given parse.Input.
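//
// A minimal usage sketch; parse.NewInputString is assumed to be available in
// github.com/tdewolff/parse/v2 (parse.NewInput wraps an io.Reader instead):
//
//	l := NewLexer(parse.NewInputString("color: red;"))
//	for {
//		tt, data := l.Next()
//		if tt == ErrorToken {
//			break // l.Err() returns io.EOF when the input is fully consumed
//		}
//		fmt.Println(tt, string(data))
//	}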
func NewLexer(r *parse.Input) *Lexer {
	return &Lexer{
		r: r,
	}
}

// Err returns the error encountered during lexing; this is often io.EOF, but other errors may be returned as well.
func (l *Lexer) Err() error {
	return l.r.Err()
}

// Next returns the next Token. It returns ErrorToken when an error is encountered; use Err() to retrieve the error.
func (l *Lexer) Next() (TokenType, []byte) {
	switch l.r.Peek(0) {
	case ' ', '\t', '\n', '\r', '\f':
		l.r.Move(1)
		for l.consumeWhitespace() {
		}
		return WhitespaceToken, l.r.Shift()
	case ':':
		l.r.Move(1)
		return ColonToken, l.r.Shift()
	case ';':
		l.r.Move(1)
		return SemicolonToken, l.r.Shift()
	case ',':
		l.r.Move(1)
		return CommaToken, l.r.Shift()
	case '(', ')', '[', ']', '{', '}':
		if t := l.consumeBracket(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '#':
		if l.consumeHashToken() {
			return HashToken, l.r.Shift()
		}
	case '"', '\'':
		if t := l.consumeString(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '.', '+':
		if t := l.consumeNumeric(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '-':
		if t := l.consumeNumeric(); t != ErrorToken {
			return t, l.r.Shift()
		} else if t := l.consumeIdentlike(); t != ErrorToken {
			return t, l.r.Shift()
		} else if l.consumeCDCToken() {
			return CDCToken, l.r.Shift()
		} else if l.consumeCustomVariableToken() {
			return CustomPropertyNameToken, l.r.Shift()
		}
	case '@':
		if l.consumeAtKeywordToken() {
			return AtKeywordToken, l.r.Shift()
		}
	case '$', '*', '^', '~':
		if t := l.consumeMatch(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '/':
		if l.consumeComment() {
			return CommentToken, l.r.Shift()
		}
	case '<':
		if l.consumeCDOToken() {
			return CDOToken, l.r.Shift()
		}
	case '\\':
		if t := l.consumeIdentlike(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case 'u', 'U':
		if l.consumeUnicodeRangeToken() {
			return UnicodeRangeToken, l.r.Shift()
		} else if t := l.consumeIdentlike(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '|':
		if t := l.consumeMatch(); t != ErrorToken {
			return t, l.r.Shift()
		} else if l.consumeColumnToken() {
			return ColumnToken, l.r.Shift()
		}
	case 0:
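		// a zero byte marks the end of the buffered input; it is only the end of the
		// stream when the reader reports an error (usually io.EOF), otherwise a
		// literal NUL byte falls through and is lexed as a delimiter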
		if l.r.Err() != nil {
			return ErrorToken, nil
		}
	default:
		if t := l.consumeNumeric(); t != ErrorToken {
			return t, l.r.Shift()
		} else if t := l.consumeIdentlike(); t != ErrorToken {
			return t, l.r.Shift()
		}
	}
	// this byte cannot start a non-ASCII rune, because consumeIdentlike would have consumed that as an identifier
	l.r.Move(1)
	return DelimToken, l.r.Shift()
}

////////////////////////////////////////////////////////////////

/*
The following functions follow the railroad diagrams in http://www.w3.org/TR/css3-syntax/
*/

func (l *Lexer) consumeByte(c byte) bool {
	if l.r.Peek(0) == c {
		l.r.Move(1)
		return true
	}
	return false
}

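// consumeComment consumes a /* ... */ comment; a comment that is not closed
// before EOF is still reported as a comment.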
func (l *Lexer) consumeComment() bool {
	if l.r.Peek(0) != '/' || l.r.Peek(1) != '*' {
		return false
	}
	l.r.Move(2)
	for {
		c := l.r.Peek(0)
		if c == 0 && l.r.Err() != nil {
			break
		} else if c == '*' && l.r.Peek(1) == '/' {
			l.r.Move(2)
			return true
		}
		l.r.Move(1)
	}
	return true
}

func (l *Lexer) consumeNewline() bool {
	c := l.r.Peek(0)
	if c == '\n' || c == '\f' {
		l.r.Move(1)
		return true
	} else if c == '\r' {
		if l.r.Peek(1) == '\n' {
			l.r.Move(2)
		} else {
			l.r.Move(1)
		}
		return true
	}
	return false
}

func (l *Lexer) consumeWhitespace() bool {
	c := l.r.Peek(0)
	if c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
		l.r.Move(1)
		return true
	}
	return false
}

func (l *Lexer) consumeDigit() bool {
	c := l.r.Peek(0)
	if c >= '0' && c <= '9' {
		l.r.Move(1)
		return true
	}
	return false
}

func (l *Lexer) consumeHexDigit() bool {
	c := l.r.Peek(0)
	if (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') {
		l.r.Move(1)
		return true
	}
	return false
}

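// consumeEscape consumes a backslash escape: '\' followed by one to six hex
// digits and an optional whitespace character, or '\' followed by any single
// character other than a newline or EOF.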
func (l *Lexer) consumeEscape() bool {
	if l.r.Peek(0) != '\\' {
		return false
	}
	mark := l.r.Pos()
	l.r.Move(1)
	if l.consumeNewline() {
		l.r.Rewind(mark)
		return false
	} else if l.consumeHexDigit() {
		for k := 1; k < 6; k++ {
			if !l.consumeHexDigit() {
				break
			}
		}
		l.consumeWhitespace()
		return true
	} else {
		c := l.r.Peek(0)
		if c >= 0xC0 {
			_, n := l.r.PeekRune(0)
			l.r.Move(n)
			return true
		} else if c == 0 && l.r.Err() != nil {
			l.r.Rewind(mark)
			return false
		}
	}
	l.r.Move(1)
	return true
}

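// consumeIdentToken consumes an identifier: an optional '-' followed by a
// name-start character (a letter, '_', a non-ASCII byte, or an escape) and any
// number of name characters.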
func (l *Lexer) consumeIdentToken() bool {
	mark := l.r.Pos()
	if l.r.Peek(0) == '-' {
		l.r.Move(1)
	}
	c := l.r.Peek(0)
	if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c >= 0x80) {
		if c != '\\' || !l.consumeEscape() {
			l.r.Rewind(mark)
			return false
		}
	} else {
		l.r.Move(1)
	}
	for {
		c := l.r.Peek(0)
		if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) {
			if c != '\\' || !l.consumeEscape() {
				break
			}
		} else {
			l.r.Move(1)
		}
	}
	return true
}

// consumeCustomVariableToken supports custom properties (custom variables), see https://www.w3.org/TR/css-variables-1/
func (l *Lexer) consumeCustomVariableToken() bool {
	// expect to be on a '-'
	l.r.Move(1)
	if l.r.Peek(0) != '-' {
		l.r.Move(-1)
		return false
	}
	if !l.consumeIdentToken() {
		l.r.Move(-1)
		return false
	}
	return true
}

func (l *Lexer) consumeAtKeywordToken() bool {
	// expect to be on an '@'
	l.r.Move(1)
	if !l.consumeIdentToken() {
		l.r.Move(-1)
		return false
	}
	return true
}

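// consumeHashToken consumes '#' followed by at least one name character or escape.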
func (l *Lexer) consumeHashToken() bool {
	// expect to be on a '#'
	mark := l.r.Pos()
	l.r.Move(1)
	c := l.r.Peek(0)
	if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) {
		if c != '\\' || !l.consumeEscape() {
			l.r.Rewind(mark)
			return false
		}
	} else {
		l.r.Move(1)
	}
	for {
		c := l.r.Peek(0)
		if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) {
			if c != '\\' || !l.consumeEscape() {
				break
			}
		} else {
			l.r.Move(1)
		}
	}
	return true
}

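// consumeNumberToken consumes a number: an optional sign, digits, an optional
// fraction and an optional exponent. A trailing '.' or 'e' that is not followed
// by a digit is left for the next token.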
func (l *Lexer) consumeNumberToken() bool {
	mark := l.r.Pos()
	c := l.r.Peek(0)
	if c == '+' || c == '-' {
		l.r.Move(1)
	}
	firstDigit := l.consumeDigit()
	if firstDigit {
		for l.consumeDigit() {
		}
	}
	if l.r.Peek(0) == '.' {
		l.r.Move(1)
		if l.consumeDigit() {
			for l.consumeDigit() {
			}
		} else if firstDigit {
			// . could belong to the next token
			l.r.Move(-1)
			return true
		} else {
			l.r.Rewind(mark)
			return false
		}
	} else if !firstDigit {
		l.r.Rewind(mark)
		return false
	}
	mark = l.r.Pos()
	c = l.r.Peek(0)
	if c == 'e' || c == 'E' {
		l.r.Move(1)
		c = l.r.Peek(0)
		if c == '+' || c == '-' {
			l.r.Move(1)
		}
		if !l.consumeDigit() {
			// e could belong to next token
			l.r.Rewind(mark)
			return true
		}
		for l.consumeDigit() {
		}
	}
	return true
}

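// consumeUnicodeRangeToken consumes a unicode-range token such as U+554A,
// U+00?? or U+0025-00FF: 'u' or 'U' and '+' followed by up to six hex digits,
// optionally padded with '?' wildcards or followed by '-' and a second run of
// up to six hex digits.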
func (l *Lexer) consumeUnicodeRangeToken() bool {
	c := l.r.Peek(0)
	if (c != 'u' && c != 'U') || l.r.Peek(1) != '+' {
		return false
	}
	mark := l.r.Pos()
	l.r.Move(2)

	// consume up to 6 hexDigits
	k := 0
	for l.consumeHexDigit() {
		k++
	}

	// either a minus or a question mark or the end is expected
	if l.consumeByte('-') {
		if k == 0 || 6 < k {
			l.r.Rewind(mark)
			return false
		}

		// consume another up to 6 hexDigits
		if l.consumeHexDigit() {
			k = 1
			for l.consumeHexDigit() {
				k++
			}
		} else {
			l.r.Rewind(mark)
			return false
		}
	} else if l.consumeByte('?') {
		// could be filled up to 6 characters with question marks or else regular hexDigits
		k++
		for l.consumeByte('?') {
			k++
		}
	}
	if k == 0 || 6 < k {
		l.r.Rewind(mark)
		return false
	}
	return true
}

func (l *Lexer) consumeColumnToken() bool {
	if l.r.Peek(0) == '|' && l.r.Peek(1) == '|' {
		l.r.Move(2)
		return true
	}
	return false
}

func (l *Lexer) consumeCDOToken() bool {
	if l.r.Peek(0) == '<' && l.r.Peek(1) == '!' && l.r.Peek(2) == '-' && l.r.Peek(3) == '-' {
		l.r.Move(4)
		return true
	}
	return false
}

func (l *Lexer) consumeCDCToken() bool {
	if l.r.Peek(0) == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' {
		l.r.Move(3)
		return true
	}
	return false
}

////////////////////////////////////////////////////////////////

// consumeMatch consumes any MatchToken.
func (l *Lexer) consumeMatch() TokenType {
	if l.r.Peek(1) == '=' {
		switch l.r.Peek(0) {
		case '~':
			l.r.Move(2)
			return IncludeMatchToken
		case '|':
			l.r.Move(2)
			return DashMatchToken
		case '^':
			l.r.Move(2)
			return PrefixMatchToken
		case '$':
			l.r.Move(2)
			return SuffixMatchToken
		case '*':
			l.r.Move(2)
			return SubstringMatchToken
		}
	}
	return ErrorToken
}

// consumeBracket consumes any bracket token.
func (l *Lexer) consumeBracket() TokenType {
	switch l.r.Peek(0) {
	case '(':
		l.r.Move(1)
		return LeftParenthesisToken
	case ')':
		l.r.Move(1)
		return RightParenthesisToken
	case '[':
		l.r.Move(1)
		return LeftBracketToken
	case ']':
		l.r.Move(1)
		return RightBracketToken
	case '{':
		l.r.Move(1)
		return LeftBraceToken
	case '}':
		l.r.Move(1)
		return RightBraceToken
	}
	return ErrorToken
}

// consumeNumeric consumes NumberToken, PercentageToken or DimensionToken.
func (l *Lexer) consumeNumeric() TokenType {
	if l.consumeNumberToken() {
		if l.consumeByte('%') {
			return PercentageToken
		} else if l.consumeIdentToken() {
			return DimensionToken
		}
		return NumberToken
	}
	return ErrorToken
}

// consumeString consumes a string and may return BadStringToken when a newline is encountered.
func (l *Lexer) consumeString() TokenType {
	// assume to be on " or '
	delim := l.r.Peek(0)
	l.r.Move(1)
	for {
		c := l.r.Peek(0)
		if c == 0 && l.r.Err() != nil {
			break
		} else if c == '\n' || c == '\r' || c == '\f' {
			l.r.Move(1)
			return BadStringToken
		} else if c == delim {
			l.r.Move(1)
			break
		} else if c == '\\' {
			if !l.consumeEscape() {
				// either newline or EOF after backslash
				l.r.Move(1)
				l.consumeNewline()
			}
		} else {
			l.r.Move(1)
		}
	}
	return StringToken
}

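// consumeUnquotedURL consumes the body of an unquoted URL up to (but not
// including) the closing ')' or EOF; it fails on whitespace, quotes, '(' and
// control characters unless they form a valid escape.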
func (l *Lexer) consumeUnquotedURL() bool {
	for {
		c := l.r.Peek(0)
		if c == 0 && l.r.Err() != nil || c == ')' {
			break
		} else if c == '"' || c == '\'' || c == '(' || c == '\\' || c == ' ' || c <= 0x1F || c == 0x7F {
			if c != '\\' || !l.consumeEscape() {
				return false
			}
		} else {
			l.r.Move(1)
		}
	}
	return true
}

// consumeRemnantsBadURL consumes the remnants of a BadURLToken so that normal tokenization may continue.
func (l *Lexer) consumeRemnantsBadURL() {
	for {
		if l.consumeByte(')') || l.r.Err() != nil {
			break
		} else if !l.consumeEscape() {
			l.r.Move(1)
		}
	}
}

// consumeIdentlike consumes an IdentToken, FunctionToken or URLToken.
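// An ident that is directly followed by '(' becomes a FunctionToken, unless the
// ident (with backslashes removed) case-insensitively equals "url", in which
// case the URL contents are consumed as well.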
func (l *Lexer) consumeIdentlike() TokenType {
	if l.consumeIdentToken() {
		if l.r.Peek(0) != '(' {
			return IdentToken
		} else if !parse.EqualFold(bytes.Replace(l.r.Lexeme(), []byte{'\\'}, nil, -1), []byte{'u', 'r', 'l'}) {
			l.r.Move(1)
			return FunctionToken
		}
		l.r.Move(1)

		// consume url
		for l.consumeWhitespace() {
		}
		if c := l.r.Peek(0); c == '"' || c == '\'' {
			if l.consumeString() == BadStringToken {
				l.consumeRemnantsBadURL()
				return BadURLToken
			}
		} else if !l.consumeUnquotedURL() && !l.consumeWhitespace() { // if unquoted URL fails due to encountering whitespace, continue
			l.consumeRemnantsBadURL()
			return BadURLToken
		}
		for l.consumeWhitespace() {
		}
		if !l.consumeByte(')') && l.r.Err() != io.EOF {
			l.consumeRemnantsBadURL()
			return BadURLToken
		}
		return URLToken
	}
	return ErrorToken
}